In [2]:
#import wave
#import pydub
import numpy as np
import pandas as pd
import librosa
import os
import matplotlib.pyplot as plt
import keras
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

Using TensorFlow backend.


In [3]:
#!pip install pyaudio
#!pip3 install librosa
#!pip3 install numpy
#!pip3 install ffmpeg
#!pip3 install keras
#!pip3 install tensorflow

### Wave Trial

In [None]:
s = wave.open("1;Petukhov V;25;M;.wav", "rb")
(nchannels, sampwidth, framerate, nframes, comptype, compname) = s.getparams()
content = s.readframes(nframes)
types = {
    1: np.int8,
    2: np.int16,
    4: np.int32,
    8: np.int64,
}
samples = np.fromstring(content, dtype = types[sampwidth])
for n in range(nchannels):
    channel = samples[n::nchannels]
duration = nframes / framerate 
w, h = 20000, 300
DPI = 72
peak = 256 ** sampwidth / 2
k = nframes/w/32

In [None]:
plt.figure(1, figsize=(20,7), dpi=DPI)
plt.plot(samples, "g")
plt.show()

In [None]:
amplitude = 300; 
freq_Hz = 500; 
data = []
n = 2000
for i in range(n):
    data.append(int(amplitude*100000000000* np.sin((2*np.pi*i*freq_Hz))/256)+123)
test = wave.open("test.wav","w")

test.setnframes(n)
test.setnchannels(1)
test.setsampwidth(4)
test.setframerate(1)

#for i in range(len(data)):
#    data[i] = bytes(data[i])
data = bytearray(data)
test.writeframes(data)

### Librosa trial

In [63]:
def splitting(name):
    return [os.path.splitext(name)[1]],  os.path.splitext(name)[0].split(';')[:-1]

In [64]:
#Каталог из которого будем брать файлы
directory = './Voices/'
#Получаем список файлов в переменную files
files = os.listdir(directory) 

In [65]:
#cut the middle values and make 10 audios from 1
lst = []
for i in range(len(files)):
    if files[i][0]=='1' or files[i][0]=='2':
        y, sr = librosa.load(directory+files[i])
        extension, name = splitting(files[i])
        interval = (len(y)-110584)//2
        for j in range(10):
            constant = 110584//10
            lst.append(name+extension+[y[interval+constant*j:interval+constant*(j+1)]]+[sr])

In [66]:
data = pd.DataFrame(lst, columns=['Num', 'Name', 'Age', 'Gender','Ext','Audio','Sr'])
data.head()

Unnamed: 0,Num,Name,Age,Gender,Ext,Audio,Sr
0,2,Kupriyanov K,20,M,.wav,"[0.004811491, 0.005607869, 0.00039207924, -0.0...",22050
1,2,Kupriyanov K,20,M,.wav,"[-0.000522882, -0.0007269805, -0.0009985598, -...",22050
2,2,Kupriyanov K,20,M,.wav,"[0.0052018664, 0.0052809045, 0.005386425, 0.00...",22050
3,2,Kupriyanov K,20,M,.wav,"[-0.0022265846, -0.0009779317, -0.0017107826, ...",22050
4,2,Kupriyanov K,20,M,.wav,"[-0.007638506, 0.0033009544, 0.00832715, 0.000...",22050


In [67]:
data.Name = data.Name.astype('category')
data.Name = data.Name.cat.rename_categories([i for i in range(data.Name.nunique())])

In [68]:
list_of_mfccs = []
for i in range(len(data)):
    mfcc = librosa.feature.mfcc(data.Audio.iloc[i], data.Sr.iloc[i])
    list_of_mfccs.append(np.array(mfcc).ravel())
data['mfcc'] = list_of_mfccs

In [73]:
list_of_ffts = []
for elem in data.Audio:
    d = np.abs(librosa.stft(elem, n_fft = 1024))
    list_of_ffts.append(np.array(d)) 
data['fft'] = list_of_ffts

In [74]:
list_of_rms = []
for elem in data.Audio:
    rms = librosa.feature.rmse(elem)
    list_of_rms.append(np.array(rms).ravel())
data['rms'] = list_of_rms

In [75]:
contrasts = []
for i in range(len(data)):
    contrast = librosa.feature.spectral_contrast(S=data.fft.iloc[i], sr=data.Sr.iloc[i])
    contrasts.append(np.array(contrast).ravel())
data['contrast'] = contrasts

In [76]:
hop_length = 512
tempos = []
for i in range(len(data)):
    oenv = librosa.onset.onset_strength(y=data.Audio.iloc[i], sr=data.Sr.iloc[i], hop_length=hop_length)
    tempo = librosa.feature.tempogram(onset_envelope=oenv, sr=data.Sr.iloc[i],
                                      hop_length=hop_length)
    tempos.append(np.array(tempo).ravel())
data['tempo'] = tempos

In [77]:
data.head()

Unnamed: 0,Num,Name,Age,Gender,Ext,Audio,Sr,mfcc,fft,rms,contrast,tempo
0,2,28,20,M,.wav,"[0.004811491, 0.005607869, 0.00039207924, -0.0...",22050,"[-392.60904471196284, -420.74703399901244, -46...","[[0.008559212, 0.19821203, 0.39636698, 0.17321...","[0.0032237044, 0.0031156135, 0.0057458566, 0.0...","[14.853753822270157, 5.553359092960635, 11.170...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,2,28,20,M,.wav,"[-0.000522882, -0.0007269805, -0.0009985598, -...",22050,"[-582.9669128712184, -496.9997666440283, -430....","[[0.21886599, 0.15453674, 0.100143924, 0.12747...","[0.002081032, 0.0047898716, 0.007908231, 0.008...","[12.011717169857842, 8.457228673149533, 12.063...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,2,28,20,M,.wav,"[0.0052018664, 0.0052809045, 0.005386425, 0.00...",22050,"[-470.5997279027319, -468.4096942575848, -494....","[[0.025064165, 0.059207693, 0.11918354, 0.0598...","[0.007283517, 0.006809681, 0.0066736103, 0.006...","[17.419322443422303, 17.362592671266533, 17.73...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,2,28,20,M,.wav,"[-0.0022265846, -0.0009779317, -0.0017107826, ...",22050,"[-427.69209013367464, -415.1758321984228, -422...","[[0.26673716, 0.2506916, 0.24540073, 0.3054897...","[0.01095216, 0.010759305, 0.012502214, 0.01382...","[19.876196873884485, 4.860169441196324, 15.027...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,2,28,20,M,.wav,"[-0.007638506, 0.0033009544, 0.00832715, 0.000...",22050,"[-457.09772306219605, -466.00725013257824, -49...","[[0.18701445, 0.28751963, 0.3799873, 0.3511352...","[0.009292727, 0.00826182, 0.006643449, 0.00445...","[31.86509560932439, 9.974674727650031, 14.7088...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [78]:
#data.Gender = (df.Gender.values > 0).astype(np.uint8)

In [79]:
data_hayam = data[data['Num'] != '2']

In [80]:
data_hayam.head()

Unnamed: 0,Num,Name,Age,Gender,Ext,Audio,Sr,mfcc,fft,rms,contrast,tempo
40,1,43,39,M,.wav,"[-0.0045401547, -0.0048513385, -0.004792961, -...",22050,"[-475.1810561875608, -454.1726019057168, -441....","[[1.4174968, 0.11539252, 0.69051325, 0.2283472...","[0.009071489, 0.009567424, 0.010832742, 0.0128...","[14.788129464962642, 10.61355615500672, 6.0912...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
41,1,43,39,M,.wav,"[0.009330407, 0.017106444, 0.023515461, 0.0304...",22050,"[-341.4915287926109, -379.5716713687056, -467....","[[3.5195684, 1.7047472, 0.6129912, 0.5455634, ...","[0.026420018, 0.025998585, 0.019436995, 0.0087...","[8.598162346849827, 9.483578215944494, 13.4864...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
42,1,43,39,M,.wav,"[0.017703088, 0.016833756, 0.016947305, 0.0157...",22050,"[-434.28852632255894, -453.62069190385296, -46...","[[0.43781066, 0.46806946, 0.43156293, 0.694807...","[0.0111753745, 0.010956885, 0.008738491, 0.005...","[21.05849872677106, 8.305093226519833, 7.59029...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
43,1,43,39,M,.wav,"[-0.00021349241, 0.00037832817, 0.0006367343, ...",22050,"[-321.5987738799005, -323.13387181901504, -364...","[[3.064468, 1.1777807, 0.2540468, 0.8278696, 0...","[0.02108208, 0.020712152, 0.015445695, 0.00668...","[27.68438790040198, 7.743812600138406, 10.1792...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
44,1,43,39,M,.wav,"[-0.024480619, -0.025467385, -0.025523111, -0....",22050,"[-383.88844399909004, -379.81767380730855, -38...","[[1.1600274, 0.8318858, 0.18485467, 0.1259521,...","[0.029184557, 0.029250154, 0.026800081, 0.0240...","[9.424001550094172, 5.1869929855896295, 12.730...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [81]:
data_turtle = data[data['Num'] == '2']
data_turtle.head()

Unnamed: 0,Num,Name,Age,Gender,Ext,Audio,Sr,mfcc,fft,rms,contrast,tempo
0,2,28,20,M,.wav,"[0.004811491, 0.005607869, 0.00039207924, -0.0...",22050,"[-392.60904471196284, -420.74703399901244, -46...","[[0.008559212, 0.19821203, 0.39636698, 0.17321...","[0.0032237044, 0.0031156135, 0.0057458566, 0.0...","[14.853753822270157, 5.553359092960635, 11.170...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,2,28,20,M,.wav,"[-0.000522882, -0.0007269805, -0.0009985598, -...",22050,"[-582.9669128712184, -496.9997666440283, -430....","[[0.21886599, 0.15453674, 0.100143924, 0.12747...","[0.002081032, 0.0047898716, 0.007908231, 0.008...","[12.011717169857842, 8.457228673149533, 12.063...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,2,28,20,M,.wav,"[0.0052018664, 0.0052809045, 0.005386425, 0.00...",22050,"[-470.5997279027319, -468.4096942575848, -494....","[[0.025064165, 0.059207693, 0.11918354, 0.0598...","[0.007283517, 0.006809681, 0.0066736103, 0.006...","[17.419322443422303, 17.362592671266533, 17.73...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,2,28,20,M,.wav,"[-0.0022265846, -0.0009779317, -0.0017107826, ...",22050,"[-427.69209013367464, -415.1758321984228, -422...","[[0.26673716, 0.2506916, 0.24540073, 0.3054897...","[0.01095216, 0.010759305, 0.012502214, 0.01382...","[19.876196873884485, 4.860169441196324, 15.027...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,2,28,20,M,.wav,"[-0.007638506, 0.0033009544, 0.00832715, 0.000...",22050,"[-457.09772306219605, -466.00725013257824, -49...","[[0.18701445, 0.28751963, 0.3799873, 0.3511352...","[0.009292727, 0.00826182, 0.006643449, 0.00445...","[31.86509560932439, 9.974674727650031, 14.7088...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [82]:
target = data.Name
target_hayam = data_hayam.Name
target_turtle = data_turtle.Name

In [88]:
features = ['Audio', 'Sr', 'mfcc', 'rms', 'contrast', 'tempo']

In [105]:
data_turtle['Audio']

(11058,)

In [106]:
new_data = pd.DataFrame()
n = data_turtple['Audio'][0].shape[0]
for i in data_turtple['Audio']:
    temp = np.zeros((n, ))
    

In [None]:
for i in 

In [None]:
for feature in features:
    

In [87]:
data_turtle.head()

Unnamed: 0,Num,Name,Age,Gender,Ext,Audio,Sr,mfcc,fft,rms,contrast,tempo
0,2,28,20,M,.wav,"[0.004811491, 0.005607869, 0.00039207924, -0.0...",22050,"[-392.60904471196284, -420.74703399901244, -46...","[[0.008559212, 0.19821203, 0.39636698, 0.17321...","[0.0032237044, 0.0031156135, 0.0057458566, 0.0...","[14.853753822270157, 5.553359092960635, 11.170...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,2,28,20,M,.wav,"[-0.000522882, -0.0007269805, -0.0009985598, -...",22050,"[-582.9669128712184, -496.9997666440283, -430....","[[0.21886599, 0.15453674, 0.100143924, 0.12747...","[0.002081032, 0.0047898716, 0.007908231, 0.008...","[12.011717169857842, 8.457228673149533, 12.063...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,2,28,20,M,.wav,"[0.0052018664, 0.0052809045, 0.005386425, 0.00...",22050,"[-470.5997279027319, -468.4096942575848, -494....","[[0.025064165, 0.059207693, 0.11918354, 0.0598...","[0.007283517, 0.006809681, 0.0066736103, 0.006...","[17.419322443422303, 17.362592671266533, 17.73...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,2,28,20,M,.wav,"[-0.0022265846, -0.0009779317, -0.0017107826, ...",22050,"[-427.69209013367464, -415.1758321984228, -422...","[[0.26673716, 0.2506916, 0.24540073, 0.3054897...","[0.01095216, 0.010759305, 0.012502214, 0.01382...","[19.876196873884485, 4.860169441196324, 15.027...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,2,28,20,M,.wav,"[-0.007638506, 0.0033009544, 0.00832715, 0.000...",22050,"[-457.09772306219605, -466.00725013257824, -49...","[[0.18701445, 0.28751963, 0.3799873, 0.3511352...","[0.009292727, 0.00826182, 0.006643449, 0.00445...","[31.86509560932439, 9.974674727650031, 14.7088...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [40]:
def making_train(data, columns):
    train = pd.DataFrame()
    for elem in columns:
        temp = []
        for i in range(len(data)):
            temp.append(np.array(data[elem].iloc[i]).ravel())
        print(temp[:10])
        train[elem]=temp
    return train
 #   flattened_ffts = []
 #   for i in range(len(list_of_ffts)):
 #       flattened_ffts.append(np.array(list_of_ffts[i]).ravel())
 #   flattened_mfccs = []
 #   for i in range(len(list_of_mfccs)):
 #       flattened_mfccs.append(np.array(list_of_mfccs[i]).ravel())
 #   return list_of_ffts, flattened_ffts, list_of_mfccs, flattened_mfccs

In [41]:
train_turtle = making_train(data_turtle, ['tempo'])

(550, 8448)


In [57]:
temp = np.zeros((550, len(data.Audio.loc[1])))

In [62]:
data['Audio'].values

array([array([ 4.8114909e-03,  5.6078690e-03,  3.9207924e-04, ...,
       -1.0595837e-05,  1.5493295e-05, -1.6463321e-04], dtype=float32),
       array([-0.00052288, -0.00072698, -0.00099856, ...,  0.0069864 ,
        0.00632424,  0.00561388], dtype=float32),
       array([ 0.00520187,  0.0052809 ,  0.00538643, ..., -0.00223495,
       -0.00150062, -0.00175715], dtype=float32),
       ...,
       array([-0.00350916, -0.00176717, -0.00370401, ...,  0.00962943,
        0.01032985,  0.00814421], dtype=float32),
       array([0.01085237, 0.01269999, 0.01360318, ..., 0.00247334, 0.0023707 ,
       0.00350196], dtype=float32),
       array([0.00428223, 0.00388586, 0.00453904, ..., 0.00998351, 0.0109589 ,
       0.02475655], dtype=float32)], dtype=object)

In [48]:
d = train_turtle.values

In [52]:
d[0]

(550, 1)

In [42]:

for name in train.columns:
    column_names = [name+str(i) for i in range(len(train_turtle[name][1]))]
    

(550, 1)

In [33]:
train_turtle.head()

Unnamed: 0,tempo
0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [24]:
train_turtle.iloc[0]

tempo    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
Name: 0, dtype: object

In [304]:
#print(np.mean([len(elem) for elem in data.Audio]))
#print(len(data.Audio[1]))

In [305]:
#list_of_ffts = []
#for elem in data.Audio:
#    d = np.abs(librosa.stft(elem))
#    list_of_ffts.append(d)
#mfcc = librosa.feature.mfcc(data.Audio[1], data.Sr[1])

In [313]:
#flattened_ffts_turtle, flattened_mfccs_turtle = making_features(data_turtle)

550


In [293]:
#import itertools

In [341]:
#train_lst = []
#for i in range(len(flattened_ffts_turtle)):
#    train_lst.append(list(itertools.chain(flattened_ffts_turtle[i], flattened_mfccs_turtle[i])))

In [343]:
#column_names = ['fft_'+str(i) for i in range(len(flattened_ffts_turtle[1]))]+ ['mfcc_'+str(i) for i in range(len(flattened_mfccs_turtle[1]))]

In [344]:
#new_train_df = pd.DataFrame(train_lst, columns=column_names)

In [186]:
from keras.layers import Convolution2D, MaxPooling2D, Conv2D
from keras.layers import Dense, Dropout, Activation, Flatten
import tensorflow as tf

In [187]:
model = keras.models.Sequential()
input_shape = (1025,22,1)
model.add(Conv2D(512, kernel_size=(5,5), input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(256, kernel_size=(5,5), input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten()) # Flattening the 2D arrays for fully connected layers
model.add(Dense(500, activation=tf.nn.relu))
model.add(Dropout(0.2))
model.add(Dense(55,activation=tf.nn.softmax)) 

In [159]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) 

In [296]:
X_train, X_test, y_train, y_test = train_test_split(flattened_ffts, target, test_size=0.1, shuffle=True)

In [297]:
X_train[5].shape

(22550,)

In [166]:
model.fit(x=X_train,y=y_train, epochs=10, batch_size=10) 

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 990 arrays: [array([[2.3632175e-01, 1.1798878e-01, 1.2653836e-02, ..., 9.9964269e-02,
        9.6054658e-02, 3.7929958e-01],
       [3.0732772e-01, 1.3116026e-01, 3.4217160e-02, ..., 6.0841173e-01,
        4.4395...

In [243]:
#X_train_hayam, X_test_hayam, y_train_hayam, y_test_hayam = train_test_split(
#    flattened_ffts_hayam, target_hayam, test_size=0.1, shuffle=True)

In [34]:
X_train_turtle, X_test_turtle, y_train_turtle, y_test_turtle = train_test_split(
    train_turtle.values, target_turtle, test_size=0.2, shuffle=True)

In [362]:
#clf = SVC(kernel='linear')
#clf.fit(X_train_hayam, y_train_hayam)
#pred_hayam = clf.predict(X_test_hayam)
#print(accuracy_score(pred_hayam, y_test_hayam))

In [35]:
clf = SVC(kernel='linear')
clf.fit(X_train_turtle, y_train_turtle)
pred_turtle = clf.predict(X_test_turtle)
print(accuracy_score(pred_turtle, y_test_turtle))

ValueError: setting an array element with a sequence.

In [28]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train_turtle, y_train_turtle)
pred_turtle = clf.predict(X_test_turtle)
print(accuracy_score(pred_turtle, y_test_turtle))

ValueError: setting an array element with a sequence.

In [353]:
features_list = sorted(list(zip(clf.feature_importances_, column_names)), reverse=True)

In [359]:
best_data_columns = pd.DataFrame()
i=0
while (features_list[i][0]>0.00001):
    best_data_columns[features_list[i][1]] = new_train_df[features_list[i][1]]
    i+=1

In [360]:
best_data_columns.shape

(550, 15153)

In [300]:
clf = LogisticRegression()
clf.fit(X_train_turtle, y_train_turtle)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [301]:
pred_turtle = clf.predict(X_test_turtle)
print(accuracy_score(pred_turtle, y_test_turtle))

0.21818181818181817


In [None]:

#plt.figure(1, figsize=(20,7), dpi=DPI)
#plt.plot(y, "g")
#plt.show()
#if it is necessary to make more samples
#resampling_file = []
#l = len(y)//40
#for i in range(1,40):
#    resampling_file.append(y[(i-1)*l:i*l])
#print(len(resampling_file))

In [None]:
'''plt.figure(1, figsize=(20,15), dpi=DPI)
D = []
for i in range(len(resampling_file)-1):
    d = np.abs(librosa.stft(resampling_file[i]))
    plt.subplot(6, 7, i+1)
    plt.plot(d, "r")
    D.append(d)
plt.show()'''

In [None]:
for i in D:
    print(i.shape)

In [None]:
D.shape

In [None]:
# 3. Run the default beat tracker
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

print('Estimated tempo: {:.2f} beats per minute'.format(tempo))

# 4. Convert the frame indices of beat events into timestamps
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

print('Saving output to beat_times.csv')
librosa.output.times_csv('beat_times.csv', beat_times)