In [1]:
# SIGNATEコンペ 特有のコード

In [2]:
# ライブラリの読み込み
import numpy as np
import pandas as pd

# keras
import keras
#from keras.datasets import mnist
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout,Input,Flatten,Activation
from keras.layers import Conv1D,Conv2D,MaxPooling1D,BatchNormalization,Reshape
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam, SGD
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.models import Model

import tensorflow as tf
import pickle

## 音声解析用
import scipy.signal as ss
import librosa
import librosa.display

from Util import NormalizeHorizontalDirection

Using TensorFlow backend.


In [3]:
# 音声データのData Augmentation
def change_speed_and_pitch(x,sr):
    """Resample a waveform by a random factor, changing speed and pitch together.

    A length factor is drawn uniformly from [0.5, 1.5) and the signal is
    linearly interpolated at a step of ``1 / factor`` samples. The result is
    written into a zeroed copy of ``x``: a longer result is truncated to the
    original length, a shorter one leaves trailing zeros.

    Args:
        x: 1-D numpy array holding the waveform; it is not modified.
        sr: Sampling rate. Unused; kept so all augmenters share a signature.

    Returns:
        Array with the same shape and dtype as ``x``.
    """
    output = x.copy()
    n_samples = len(output)

    # The bounds of the random length factor can be tuned here.
    length_change = np.random.uniform(low=0.5,high=1.5)
    step = 1.0  / length_change
    print("resample length_change = ",length_change)

    # Linear-interpolation resampling: read the original signal every `step` samples.
    query_positions = np.arange(0,n_samples,step)
    sample_positions = np.arange(0,n_samples)
    resampled = np.interp(query_positions,sample_positions,output)

    n_keep = min(output.shape[0], resampled.shape[0])
    output *= 0
    output[0:n_keep] = resampled[0:n_keep]
    return output

In [4]:
def change_pitch(x,sr):
    """Shift the pitch of a waveform by a random amount.

    The shift is drawn uniformly from [-4, 4) steps with 24 steps per octave,
    i.e. roughly +/- 2 semitones.

    Args:
        x: numpy array holding the waveform; it is not modified.
        sr: Sampling rate of ``x``.

    Returns:
        Whatever ``librosa.effects.pitch_shift`` returns for the float64 copy
        of ``x`` (the pitch-shifted waveform).
    """
    bins_per_octave = 24
    pitch_pm = 4
    pitch_change =  pitch_pm * 2*(np.random.uniform()-0.5)
    print("pitch_change = ",pitch_change)
    # ``sr`` is passed by keyword: recent librosa releases make it keyword-only,
    # while older releases accept the keyword form as well.
    # ``astype`` already returns a new array, so no explicit copy is needed.
    return librosa.effects.pitch_shift(x.astype('float64'),
                                       sr=sr, n_steps=pitch_change,
                                       bins_per_octave=bins_per_octave)

In [5]:
def change_speed(x,sr):
    """Time-stretch a waveform by a random rate while keeping its length.

    The rate is drawn uniformly from [0.7, 1.3). The stretched signal is
    written into a zeroed copy of ``x``: a longer result is truncated to the
    original length, a shorter one leaves trailing zeros.

    Args:
        x: 1-D numpy array holding the waveform; it is not modified.
        sr: Sampling rate. Unused; kept so all augmenters share a signature.

    Returns:
        Array with the same shape and dtype as ``x``.
    """
    y_speed = x.copy()
    speed_change = np.random.uniform(low=0.7,high=1.3)
    print("speed_change = ",speed_change)
    # ``rate`` is passed by keyword: recent librosa releases make it
    # keyword-only, while older releases accept the keyword form as well.
    stretched = librosa.effects.time_stretch(y_speed.astype('float64'), rate=speed_change)
    minlen = min(y_speed.shape[0], stretched.shape[0])
    y_speed *= 0
    y_speed[0:minlen] = stretched[0:minlen]
    return y_speed

In [6]:
def add_noise(x,sr):
    """Add Gaussian noise whose amplitude scales with the signal's maximum.

    The noise amplitude is ``0.005 * u * max(x)`` with ``u`` drawn uniformly
    from [0, 1); the noise itself is standard normal, one value per element
    along the first axis.

    Args:
        x: numpy array holding the waveform; it is not modified.
        sr: Sampling rate. Unused; kept so all augmenters share a signature.

    Returns:
        float64 array: ``x`` plus the scaled noise.
    """
    signal = x.copy()
    # Any other distribution from numpy.random could be substituted here.
    noise = 0.005
    noise_amp = noise*np.random.uniform()*np.amax(signal)
    gaussian = np.random.normal(size=signal.shape[0])
    return signal.astype('float64') + noise_amp * gaussian

In [7]:
def slightly_timeshift(x,sr):
    """Shift a waveform in time by a random offset of up to 10% of its length.

    A positive offset delays the signal (zeros are inserted at the front); a
    negative offset advances it (the first samples are dropped and zeros are
    appended). The output always has the same length as the input.

    Previously the negative case padded the tail and then kept the first
    ``len(x)`` samples, which returned the signal unchanged; it now really
    shifts to the left.

    Args:
        x: 1-D numpy array holding the waveform; it is not modified.
        sr: Sampling rate. Unused; kept so all augmenters share a signature.

    Returns:
        The shifted waveform, same length as ``x``.
    """
    y_shift = x.copy()
    change_length_rate = 0.1 # up to 10% of length
    timeshift_fac = change_length_rate *2*(np.random.uniform()-0.5)
    print("timeshift_fac = ",timeshift_fac)
    n_samples = y_shift.shape[0]
    start = int(n_samples * timeshift_fac)
    print(start)
    if (start > 0):
        # Delay: prepend `start` zeros and drop what falls off the end.
        y_shift = np.pad(y_shift,(start,0),mode='constant')[0:n_samples]
    elif (start < 0):
        # Advance: append |start| zeros and drop the first |start| samples.
        y_shift = np.pad(y_shift,(0,-start),mode='constant')[-start:]
    return y_shift

In [8]:
def stfft(extract_x,sr):
    """Compute the magnitude short-time Fourier transform of a waveform.

    Uses a Hamming window of 25 "millisecond units" with 15 units of overlap,
    where one unit is ``int(sr / 1000)`` samples, and an FFT length of 1023
    (which gives 512 frequency bins for real input).

    Args:
        extract_x: Waveform samples.
        sr: Sampling rate in Hz.

    Returns:
        Tuple ``(f, t, spectral)``: frequency axis, time axis and ``|Zxx|``.
        Normalisation is left to the caller, after the time range is cut out.
    """
    samples_per_ms = int(sr * (10 ** (-3)))
    window_len = samples_per_ms * 25    # 25 ms analysis window
    overlap_len = samples_per_ms * 15   # consecutive windows share 15 ms
    hamming = ss.get_window("hamming",window_len)

    # The original author noted being unsure why an FFT length of 1024 also works here.
    freqs, times, coeffs = ss.stft(extract_x,
                                   fs = sr,
                                   window = hamming,
                                   nperseg = window_len,
                                   noverlap = overlap_len,
                                   nfft = 1023)
    return (freqs, times, np.abs(coeffs))

In [9]:
def zero_padding(spectrogram,thre=500):
    """Force a spectrogram to exactly ``thre`` columns.

    Longer inputs keep only their first ``thre`` columns; shorter inputs are
    padded with zeros on the right. The offset is deliberately fixed at 0:
    random placement was dropped once the slight time-shift augmentation was
    introduced.

    Args:
        spectrogram: 2-D array, columns being time frames.
        thre: Target number of columns.

    Returns:
        2-D array with ``thre`` columns.
    """
    n_frames = spectrogram.shape[1]
    if n_frames < thre:
        # Too short: fill the missing frames with zeros at the end.
        return np.pad(spectrogram,((0,0),(0,thre - n_frames)),"constant")
    # Long enough: keep the leading `thre` frames.
    return spectrogram[:,0:thre]

In [10]:
def calc_spectrogram(filepath,lb_ms=0,ub_ms=500,speed=False,pitch=False,time_shift=False,noise=False):
    """Load an audio file and return a normalised, fixed-width spectrogram.

    Optional augmentations are applied to the waveform first. ``speed`` and
    ``pitch`` together select the combined resampling augmentation; alone they
    select time-stretch or pitch-shift. Time shift and noise are applied
    afterwards, independently of each other.

    Args:
        filepath: Path handed to ``librosa.load``.
        lb_ms, ub_ms: Column bounds used to slice the spectrogram. Despite the
            names they index STFT frames, not milliseconds.
        speed, pitch, time_shift, noise: Augmentation switches.

    Returns:
        The sliced spectrogram after ``NormalizeHorizontalDirection`` and
        padding/truncation to 500 columns.
    """
    # Load the audio data.
    x,sr = librosa.load(filepath)

    if speed:
        if pitch:
            # Change speed and pitch together.
            print("change speed and pitch")
            x = change_speed_and_pitch(x,sr)
        else:
            # Change the speed only.
            print("change speed")
            x = change_speed(x,sr)
    elif pitch:
        # Change the pitch only.
        print("change pitch")
        x = change_pitch(x,sr)

    if time_shift:
        # Shift the audio slightly along the time axis.
        print("time_shift")
        x = slightly_timeshift(x,sr)

    if noise:
        # Add noise to the audio.
        print("add noise")
        x = add_noise(x,sr)

    _, _, spectrogram = stfft(x,sr)
    # TODO: add normalisation along the frequency direction.
    # NOTE(review): the exact normalisation is defined in Util and not visible here.
    windowed = spectrogram[:,lb_ms:ub_ms]
    normalised = NormalizeHorizontalDirection(windowed)

    # TODO: zero padding
    return zero_padding(normalised,thre=500)

In [59]:
class SpectrogramGenerator(keras.callbacks.Callback):
    """Serve spectrogram mini-batches and drive an augmentation curriculum.

    The object plays two roles. It exposes infinite generators
    (``next_train``, ``next_val``, ``test``) that yield ``(inputs, targets)``
    with ``inputs`` of shape ``(batch_size, n_freq, n_frames, 1)`` (float32)
    and one-hot ``targets`` of width ``len(speaker_dict)``. It is also a Keras
    callback: at the start of each epoch it swaps ``calc_spectrogram_func`` so
    that more augmentations are switched on as training progresses.

    Only complete batches are yielded; samples left over at the end of a pass
    over the data are carried into the first batch of the next pass.

    FIXME: ``lb`` / ``ub`` are not milliseconds; they are spectrogram column
    (frame) indices forwarded to ``calc_spectrogram``.

    Args:
        lb, ub: Column bounds forwarded as ``lb_ms`` / ``ub_ms``.
        train_info: Table with ``"filepath"`` and ``"label"`` columns; 10% is
            held out for validation with a fixed random seed.
        speaker_dict: Maps a label to its integer class id. It must contain
            ``"FE_AD"``, which is used as the dummy label for test data.
        batch_size: Number of samples per yielded batch.
        time_len: Stored but not used by this class.
        test_info: Table with a ``"filepath"`` column.
        milestones: Epoch thresholds; stage ``i`` of the curriculum is active
            while ``epoch <= milestones[i]`` (first match wins).
    """

    # Augmentation switches per curriculum stage, in the order the milestones
    # are checked.
    _CURRICULUM_STAGES = (
        dict(speed=False,pitch=False,time_shift=False,noise=False),  # no augmentation
        dict(speed=False,pitch=False,time_shift=True,noise=False),   # + time shift
        dict(speed=True,pitch=False,time_shift=True,noise=False),    # + speed change
        dict(speed=True,pitch=True,time_shift=True,noise=False),     # + pitch change
        dict(speed=True,pitch=True,time_shift=True,noise=True),      # + additive noise
    )

    def __init__(self,lb,ub,train_info,speaker_dict,batch_size,time_len,test_info,milestones):
        super(SpectrogramGenerator, self).__init__()
        self.reset()
        self.lb = lb
        self.ub = ub
        # Start without augmentation; on_epoch_begin replaces this during training.
        self.calc_spectrogram_func = self._make_calc_func()

        X_train,X_val,y_train,y_val = train_test_split(np.array(train_info["filepath"]),
                                                       np.array(train_info["label"]),
                                                       test_size=0.1,
                                                       random_state = 1234)

        self.X_train = X_train
        self.X_val = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.speaker_dict = speaker_dict
        self.batch_size = batch_size
        self.time_len = time_len

        # Steps per epoch for each split.
        self.train_step_per_epoch = int(np.ceil(len(X_train) / batch_size))
        self.val_step_per_epoch = int(np.ceil(len(X_val) / batch_size))

        # Test data; labels are unknown, so a dummy label is attached.
        X_test = np.array(test_info["filepath"])
        self.X_test = X_test
        self.y_test = np.repeat("FE_AD",len(test_info["filepath"]))
        self.test_step_per_epoch = int(np.ceil(len(X_test) / batch_size))

        # Curriculum learning thresholds.
        self.milestones = milestones

    def reset(self):
        """Clear the legacy shared buffers.

        Kept for backward compatibility: the generators now accumulate into
        their own local buffers so that several generators can be consumed
        concurrently without corrupting each other's batches.
        """
        self.spectrograms = []
        self.labels = []

    def _make_calc_func(self,speed=False,pitch=False,time_shift=False,noise=False):
        """Return ``filepath -> spectrogram`` with the given augmentations enabled."""
        return lambda filepath: calc_spectrogram(filepath,
                                                 lb_ms=self.lb,ub_ms=self.ub,
                                                 speed=speed,pitch=pitch,
                                                 time_shift=time_shift,noise=noise)

    def _generate_batches(self,filepaths,labels,calc_func,shuffle):
        """Yield ``(inputs, targets)`` batches forever over the given samples.

        ``calc_func`` is called once per file; ``shuffle`` draws a fresh random
        order at the start of every pass.
        """
        if len(filepaths) == 0:
            # An empty split would otherwise loop forever without yielding.
            raise ValueError("no samples to generate batches from")

        num_classes = len(self.speaker_dict)
        spectrograms = []
        targets = []

        while True:
            if shuffle:
                order = np.random.permutation(len(filepaths))
                pass_files,pass_labels = filepaths[order],labels[order]
            else:
                pass_files,pass_labels = filepaths,labels

            for filepath,label in zip(pass_files,pass_labels):
                spectrogram = calc_func(filepath)
                target = to_categorical(self.speaker_dict[label],num_classes=num_classes)

                spectrograms.append(spectrogram)
                targets.append(target)

                if len(targets) == self.batch_size:
                    inputs = np.asarray(spectrograms, dtype=np.float32).reshape(self.batch_size,
                                                                                spectrogram.shape[0],
                                                                                spectrogram.shape[1],1)
                    outputs = np.asarray(targets, dtype=np.float32)
                    spectrograms = []
                    targets = []
                    yield (inputs,outputs)

    def next_train(self):
        """Infinite generator of training batches, reshuffled on every pass.

        Uses whatever ``calc_spectrogram_func`` is current when each sample is
        processed, so curriculum changes take effect immediately.
        """
        return self._generate_batches(self.X_train,self.y_train,
                                      lambda filepath: self.calc_spectrogram_func(filepath),
                                      shuffle=True)

    def next_val(self):
        """Infinite generator of validation batches in a fixed order.

        NOTE(review): like the original code this goes through
        ``calc_spectrogram_func``, so validation data receives the same
        augmentations as training data once the curriculum enables them.
        Confirm whether that is intended.
        """
        return self._generate_batches(self.X_val,self.y_val,
                                      lambda filepath: self.calc_spectrogram_func(filepath),
                                      shuffle=False)

    def test(self):
        """Infinite generator of test batches; never augmented, dummy targets."""
        return self._generate_batches(self.X_test,self.y_test,
                                      self._make_calc_func(),
                                      shuffle=False)

    def on_train_begin(self,logs=None):
        """Keras hook: start training without any augmentation."""
        self.reset()
        self.calc_spectrogram_func = self._make_calc_func()

    def on_epoch_begin(self,epoch,logs=None):
        """Keras hook: pick the augmentation stage for ``epoch``.

        Curriculum learning: augmentations are introduced as training advances.
        The first stage whose milestone satisfies ``epoch <= milestone`` is
        applied; if none matches, the previous setting stays in effect.
        """
        for milestone,flags in zip(self.milestones,self._CURRICULUM_STAGES):
            if epoch <= milestone:
                self.calc_spectrogram_func = self._make_calc_func(**flags)
                break


In [12]:
# https://stackoverflow.com/questions/43915482/how-do-you-create-a-custom-activation-function-with-keras
# 上記参考
def log_relu(x):
    """Custom activation: natural log of ``relu(x) + 0.001``.

    The small offset keeps the logarithm finite where the ReLU output is zero.
    """
    rectified = K.relu(x)
    return K.log(rectified+0.001)

In [13]:
# 学習モデル、コールバックの定義
def build_cldnns(input_shape,num_classes,loss="sparse_categorical_crossentropy"):
    """Build and compile the raw-waveform convolution + LSTM classifier.

    Layer stack: Conv1D over the waveform (400 filters, 35 ms kernel at
    22050 Hz) -> max-pool over all time steps -> ``log_relu`` -> reshape to
    ``(400, 1)`` -> Conv1D (256 filters, kernel 8, ReLU) -> max-pool (3) ->
    three LSTMs (832 units, sequences returned) -> flatten -> Dense(1024) ->
    batch norm -> ReLU -> Dense(num_classes, sigmoid).

    Args:
        input_shape: ``[n_samples, 1]`` shape of one waveform.
        num_classes: Number of output units.
        loss: Loss passed to ``model.compile``. The default keeps the original
            behaviour. NOTE(review): the generators in this file produce
            one-hot targets, whereas the sparse loss expects integer class
            ids; ``"categorical_crossentropy"`` is probably what is needed
            with those generators. Confirm against the training code.

    Returns:
        The compiled Keras ``Model`` (SGD with Nesterov momentum, accuracy metric).
    """
    # Number of labels.
    output_dim = num_classes

    sr = 22050
    conv_filters = 400
    window_size_sec = 35 * (10 ** -3)
    # Kernel length of the first convolution, in samples.
    conv1_kernel = int(window_size_sec * sr)

    input_data = Input(name='the_input',
                       shape=input_shape,
                       dtype='float32')

    inner = Conv1D(filters = conv_filters,
                   kernel_size = conv1_kernel,
                   padding='valid',
                   kernel_initializer='glorot_uniform',
                   name='conv1')(input_data)

    # A 'valid' convolution leaves input_len - kernel + 1 steps; pooling over
    # all of them keeps a single maximum per filter.
    inner = MaxPooling1D(pool_size= input_shape[0] - conv1_kernel + 1, name='max1')(inner)

    inner = Activation(log_relu,name="act1")(inner)

    # Treat the per-filter responses as a length-400 sequence with one channel.
    inner = Reshape((conv_filters,1),name="reshape1")(inner)

    # If a separate activation layer is added after conv2, switch this
    # activation to linear.
    inner = Conv1D(filters=256,
                   kernel_size= 8, padding='valid',
                   kernel_initializer='glorot_uniform',
                   activation="relu",
                   name='conv2')(inner)

    inner = MaxPooling1D(pool_size = 3,
                         name='max2')(inner)

    # NOTE(review): input_shape on these LSTMs presumably has no effect since
    # each layer is called on an existing tensor; kept so layer configs are
    # unchanged.
    timesteps = 32
    data_dim = 512
    rnn_size = 832

    inner = LSTM(rnn_size, return_sequences=True,
                 input_shape=(timesteps, data_dim),
                 name = "lstm1")(inner)

    inner = LSTM(rnn_size, return_sequences=True,
                 input_shape=(timesteps, data_dim),
                 name = "lstm2")(inner)

    inner = LSTM(rnn_size, return_sequences=True,
                 input_shape=(timesteps, data_dim),
                 name = "lstm3")(inner)

    inner = Flatten(name = "flatten")(inner)

    inner = Dense(1024,name = "dense1")(inner)

    inner = BatchNormalization(name="batch_norm1")(inner)

    inner = Activation("relu",name="act2")(inner)

    out = Dense(output_dim,activation="sigmoid",name = "dense2")(inner)

    model = Model(inputs=input_data, outputs=out)

    # If lr = 0.001 does not train well, switching to Adam is an option.
    # An F1-score callback would be a useful addition:
    # https://qiita.com/koshian2/items/81abfc0a75ea99f726b9

    # Same value as Adam's default learning rate.
    LEARNING_RATE = 0.001

    sgd = SGD(lr=LEARNING_RATE,
              decay=1e-6,
              momentum=0.9,
              nesterov=True,
              clipnorm=5)

    model.compile(loss = loss, optimizer=sgd,metrics = ["accuracy"])

    return model


In [14]:
class RawWaveGenerator(keras.callbacks.Callback):
    """Serve fixed-length raw-waveform mini-batches from pickled audio files.

    Each row of ``data_df`` must provide ``"label"`` and ``"raw_wave_path"``;
    the path points to a pickle holding a dict with ``"sampling_rate"`` and
    ``"raw_wave"``. Waveforms are truncated or zero-padded to
    ``duration_sec * 22050`` samples.

    Args:
        minibatchsize: Number of samples per batch.
        duration_sec: Clip length in seconds (integer, so that the sample
            count is an integer).
        data_df: Table describing the samples (see above).
        label2int: Maps a label to its integer class id.
        shuffle: Shuffle the rows before every pass over the data.

    Attributes:
        iteration: Number of *full* batches in one pass
            (``data_size // minibatchsize``).
    """

    def __init__(self,minibatchsize,duration_sec,data_df,label2int,shuffle = True):
        super(RawWaveGenerator, self).__init__()
        self.duration_sec = duration_sec
        self.minibatchsize = minibatchsize
        self.cur_index = 0
        self.data_df = data_df
        self.data_size = data_df.shape[0]
        self.iteration = int(np.floor(self.data_size/minibatchsize))
        self.sr = 22050
        self.label2int = label2int
        self.shuffle = shuffle

    def shuffle_df(self):
        """Shuffle the order in which the samples are served."""
        self.data_df = self.data_df.sample(frac=1).reset_index(drop=True)

    def get_batch(self,minibatchsize,duration_sec):
        """Load the next ``minibatchsize`` rows starting at ``cur_index``.

        The returned arrays have as many rows as samples were actually
        available, so the last batch of a pass may be smaller. Previously it
        was zero-padded with all-zero label rows. ``cur_index`` is advanced by
        ``minibatchsize``.

        Returns:
            Tuple ``(inputs, outputs)`` of shapes
            ``(n, duration_sec * 22050, 1)`` and ``(n, num_classes)``.

        Raises:
            AssertionError: If a file's sampling rate is not 22050.
        """
        num_classes = len(self.label2int)
        n_samples = duration_sec*self.sr

        # Positional slicing, so the frame's index labels do not matter.
        batch_df = self.data_df.iloc[self.cur_index:self.cur_index+minibatchsize]
        n_rows = len(batch_df)

        inputs = np.zeros([n_rows,n_samples,1])
        outputs = np.zeros([n_rows,num_classes])

        for i,(_,row) in enumerate(batch_df.iterrows()):
            label = row["label"]
            wave_path = row["raw_wave_path"]

            # NOTE: unpickling can execute arbitrary code; only load files
            # that this project created itself.
            with open(wave_path,mode="rb") as f:
                payload = pickle.load(f)
            sr = payload["sampling_rate"]
            raw_wave = payload["raw_wave"]

            # Every file is expected to be sampled at 22050 Hz.
            assert sr == 22050

            # Cut to the target duration, then zero-pad clips that are shorter.
            clip = raw_wave[:n_samples]
            clip = np.pad(clip, [0,n_samples-len(clip)],'constant')

            inputs[i,:,0] = clip
            outputs[i,:] = to_categorical(self.label2int[label],num_classes=num_classes)

        self.cur_index = self.cur_index + minibatchsize

        return (inputs,outputs)

    def next_train(self):
        """Infinite generator of training batches.

        One pass serves every row exactly once. When the data is exhausted
        the cursor wraps to the start and, if ``shuffle`` is set, the rows
        are reshuffled. The previous wrap test was off by one and produced an
        extra, partly or entirely zero-filled batch at the end of each pass.

        Raises:
            ValueError: If ``data_df`` has no rows.
        """
        if self.data_size == 0:
            raise ValueError("data_df contains no samples")

        self.cur_index = 0

        if self.shuffle:
            self.shuffle_df()

        while True:
            if self.cur_index >= self.data_size:
                self.cur_index = 0
                if self.shuffle:
                    self.shuffle_df()
            yield self.get_batch(minibatchsize = self.minibatchsize,
                                 duration_sec = self.duration_sec)

#    def next_val(self):
#        while True:           
#            yield self.get_batch(minibatchsize = self.minibatchsize,
#                                 duration_sec = self.duration_sec)


In [15]:
# f1loss, macro_f1 metricsを定義
def _per_class_f1(y_true, y_pred):
    """Return the F1 score of every class (column) for one batch.

    True positives, false positives and false negatives are summed over the
    batch axis; ``K.epsilon()`` guards every division. Any NaN that still
    appears is replaced by 0.
    """
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    score = 2*p*r / (p+r+K.epsilon())
    # tf.is_nan only exists in TensorFlow 1.x; newer releases expose it as
    # tf.math.is_nan.
    is_nan = getattr(tf, "is_nan", None) or tf.math.is_nan
    return tf.where(is_nan(score), tf.zeros_like(score), score)

def f1(y_true, y_pred):
    """Macro-averaged F1 metric.

    Predictions are rounded to 0/1 before counting, so this is not
    differentiable and is meant for monitoring only.

    Args:
        y_true: One-hot (or multi-hot) targets, shape ``(batch, classes)``.
        y_pred: Predicted scores in [0, 1], same shape.

    Returns:
        Scalar mean of the per-class F1 scores.
    """
    return K.mean(_per_class_f1(y_true, K.round(y_pred)))

def f1_loss(y_true, y_pred):
    """Soft macro-F1 loss: ``1 - mean(per-class F1)``.

    Uses the raw predicted scores (no rounding), so the value stays
    differentiable with respect to ``y_pred``.

    Args:
        y_true: One-hot (or multi-hot) targets, shape ``(batch, classes)``.
        y_pred: Predicted scores in [0, 1], same shape.

    Returns:
        Scalar loss.
    """
    return 1 - K.mean(_per_class_f1(y_true, y_pred))

In [16]:
if __name__ == "__main__":
    # Parameters for a raw-waveform model: 3 seconds of audio at 22050 Hz,
    # one channel, six target classes.
    sr = 22050
    duration_sec = 3
    input_shape = [sr*duration_sec,1]
    num_classes = 6
    # Model construction is currently disabled.
    #models = build_cldnns(input_shape,num_classes)
    
    ## TODO: add test code for the classes and functions defined above below this line.
    

In [56]:
if __name__ == "__main__":
    # Smoke test for SpectrogramGenerator: load the training/test file lists
    # and build a generator from them.
    
    ## Training data
    # NOTE: absolute, machine-specific working directory; every path below is
    # built by string concatenation and relies on the trailing slash.
    work_dir = "/home/taichi/DataAnalysis/05_NTT_corevo/"
    
    # Header-less TSV with two columns: file name and class label.
    train_info = pd.read_csv(work_dir + "01_input/ntt_corevo/class_train.tsv",
                        delimiter = "\t",
                        names = ["filename","label"])

    train_info["filepath"] = work_dir + "01_input/ntt_corevo/train/" + train_info["filename"] + ".wav"
    
    ## Test data
    test_info = pd.read_csv(work_dir + "03_work/test_time_distribution.csv")
    
    # NOTE(review): unlike the training paths there is no sub-directory between
    # "ntt_corevo/" and the file name; presumably "filename" already contains
    # it. Verify against the CSV.
    test_info["filepath"] = work_dir + "01_input/ntt_corevo/" + test_info["filename"] + ".wav"
        
    ## Define the generator parameters
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(work_dir + "03_work/label2int.pickle",mode = "rb") as f:
        label2int = pickle.load(f)
        
    # lb/ub are forwarded to calc_spectrogram as spectrogram column bounds.
    lb=0
    ub=500
    speaker_dict=label2int
    BATCH_SIZE=7
    time_len = 500
    
    # With these thresholds no epoch >= 0 satisfies the first four milestones,
    # so epochs up to 100 fall into the last stage (all augmentations enabled).
    milestones = [-1,-1,-1,-1,100]
    datagen = SpectrogramGenerator(lb,ub,train_info,# optionally restrict with .head(n)
                                     speaker_dict,BATCH_SIZE,time_len,test_info,
                                   milestones)
    