In [1]:
# signateのデータに対してCLDNNsを実装

In [2]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display # 波形のプロットに必要
import IPython.display as ipd #jupyter-notebook上で音声再生
import glob
import re
import seaborn as sns
import pickle
import scipy.signal as ss
import os

# keras
import keras
#from keras.datasets import mnist
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, Flatten,Activation
from keras.layers import Conv2D, MaxPooling2D,BatchNormalization
from keras.optimizers import Adam, SGD
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from keras.callbacks import ModelCheckpoint,EarlyStopping

# モデル可視化用
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

Using TensorFlow backend.


In [3]:
# Widen the notebook cells to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container{width:100%!important;}</style>"))

In [64]:
# Fix the seed of NumPy's global random number generator.
np.random.seed(123)

In [4]:
# Set the working directory; the data paths used below are relative to it.
# The location can be overridden with the NTT_COREVO_DIR environment variable
# so the notebook is not tied to one machine's absolute path. The previous
# hard-coded location stays as the default.
os.chdir(os.environ.get("NTT_COREVO_DIR", "/home/taichi/DataAnalysis/05_NTT_corevo"))

In [5]:
# Read the tab-separated training list (columns: filename, label) and add,
# for every clip, the path of its pickled raw waveform.
train_info = pd.read_csv(
    "01_input/ntt_corevo/class_train.tsv",
    sep="\t",
    names=["filename", "label"],
).assign(
    raw_wave_path=lambda frame: "03_work/raw_waves/train/" + frame["filename"] + ".pickle"
)

In [11]:
train_info.head()

Unnamed: 0,filename,label,raw_wave_path
0,0002f1cd968ca78ada9e1c7037224773,MA_CH,03_work/raw_waves/train/0002f1cd968ca78ada9e1c...
1,0003747ec9268461d4cbb9e1b86e9663,FE_AD,03_work/raw_waves/train/0003747ec9268461d4cbb9...
2,0003b32f378b001f0f73bf0981da8773,MA_CH,03_work/raw_waves/train/0003b32f378b001f0f73bf...
3,0004ab975bf8b59e1b19f2b7b6d1548b,MA_CH,03_work/raw_waves/train/0004ab975bf8b59e1b19f2...
4,0005678b57ca265a65f8ef0cc7481277,MA_AD,03_work/raw_waves/train/0005678b57ca265a65f8ef...


In [6]:
train_info["raw_wave_path"][0]

'03_work/raw_waves/train/0002f1cd968ca78ada9e1c7037224773.pickle'

In [7]:
# ファイルが存在するか確認
#!ls -lad 03_work/raw_waves/train/0002f1cd968ca78ada9e1c7037224773.pickle 

In [8]:
train_info.head()

Unnamed: 0,filename,label,raw_wave_path
0,0002f1cd968ca78ada9e1c7037224773,MA_CH,03_work/raw_waves/train/0002f1cd968ca78ada9e1c...
1,0003747ec9268461d4cbb9e1b86e9663,FE_AD,03_work/raw_waves/train/0003747ec9268461d4cbb9...
2,0003b32f378b001f0f73bf0981da8773,MA_CH,03_work/raw_waves/train/0003b32f378b001f0f73bf...
3,0004ab975bf8b59e1b19f2b7b6d1548b,MA_CH,03_work/raw_waves/train/0004ab975bf8b59e1b19f2...
4,0005678b57ca265a65f8ef0cc7481277,MA_AD,03_work/raw_waves/train/0005678b57ca265a65f8ef...


In [15]:
# Load the label-name -> integer-id mapping from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open("03_work/label2int.pickle",mode = "rb") as f:
    label2int = pickle.load(f)

In [10]:
label2int

{'FE_AD': 0, 'FE_CH': 1, 'FE_EL': 2, 'MA_AD': 3, 'MA_CH': 4, 'MA_EL': 5}

In [None]:
# データセットの分割

In [12]:
# Pick the first training clip and load its pickled waveform.
# `tmp`, `raw_wave` and `sr` are inspected by the cells that follow, so they
# are defined here explicitly instead of relying on leftover kernel state.
i = 0
wave_path = train_info.loc[i,"raw_wave_path"]

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(wave_path, mode="rb") as f:
    tmp = pickle.load(f)

raw_wave = tmp["raw_wave"]
sr = tmp["sampling_rate"]

In [13]:
wave_path

'03_work/raw_waves/train/0002f1cd968ca78ada9e1c7037224773.pickle'

In [20]:
tmp

{'raw_wave': array([-0.00245125, -0.00314045, -0.00422269, ...,  0.16820887,
         0.16100122,  0.        ], dtype=float32), 'sampling_rate': 22050}

In [29]:
len(raw_wave)/sr

3.1100226757369613

In [108]:
train_info.head()

Unnamed: 0,filename,label,raw_wave_path
0,0002f1cd968ca78ada9e1c7037224773,MA_CH,03_work/raw_waves/train/0002f1cd968ca78ada9e1c...
1,0003747ec9268461d4cbb9e1b86e9663,FE_AD,03_work/raw_waves/train/0003747ec9268461d4cbb9...
2,0003b32f378b001f0f73bf0981da8773,MA_CH,03_work/raw_waves/train/0003b32f378b001f0f73bf...
3,0004ab975bf8b59e1b19f2b7b6d1548b,MA_CH,03_work/raw_waves/train/0004ab975bf8b59e1b19f2...
4,0005678b57ca265a65f8ef0cc7481277,MA_AD,03_work/raw_waves/train/0005678b57ca265a65f8ef...


In [107]:
train_df = train_info.sample(frac=1).reset_index(drop=True)

In [95]:
# Prototype: assemble one minibatch of fixed-length raw waveforms by hand.

cur_index = 0
duration_sec = 5
minibatchsize=32
# Expected sampling rate of every clip. Defined up front so that the array
# allocation below does not depend on a value left over from another cell.
sr = 22050
n_samples = duration_sec*sr

inputs = np.zeros([minibatchsize,n_samples,1])
outputs = np.zeros(minibatchsize)

# Shuffle the training rows, then take the rows of the current minibatch.
train_batch_df = train_info.sample(frac=1).reset_index(drop=True).loc[cur_index:cur_index+minibatchsize-1,:]

# Index the arrays by position inside the batch (0 .. minibatchsize-1) rather
# than by DataFrame label, so the loop stays valid when cur_index is not 0.
for i,(_,v) in enumerate(train_batch_df.iterrows()):
    print(i)
    label = v["label"]
    wave_path = v["raw_wave_path"]

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(wave_path,mode="rb") as f:
        tmp = pickle.load(f)
    file_sr = tmp["sampling_rate"]
    raw_wave = tmp["raw_wave"]

    # Every clip is expected to share the same sampling rate.
    assert file_sr == sr

    # Truncate to duration_sec seconds, then zero-pad the tail of shorter clips
    # (the original referenced an undefined name `duration` here).
    a = raw_wave[:n_samples]
    a = np.pad(a, [0,n_samples-len(a)],'constant')

    inputs[i,:,0] = a
    outputs[i] = label2int[label]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


SyntaxError: 'return' outside function (<ipython-input-95-edc76a62a4a4>, line 38)

In [109]:
sr

22050

In [51]:
# Number of samples in a clip of `duration_sec` seconds at sampling rate `sr`.
duration_sec*sr

110250

In [None]:
# batchsizeで割り切れない場合を考慮しないといけない。

In [97]:
train_batch_df.shape[0]

32

In [161]:
# filepathとlabelを入力として受け取るRAW_WAVE_GENERATORを実装
class RawWaveGenerator(keras.callbacks.Callback):
    """Endless minibatch generator of fixed-length raw waveforms.

    Each row of ``data_df`` must provide a ``raw_wave_path`` column (path of a
    pickle file holding a dict with the keys ``"raw_wave"`` and
    ``"sampling_rate"``) and a ``label`` column. Every waveform is truncated,
    or zero-padded at its end, to ``duration_sec`` seconds.

    The rows are shuffled once per epoch and then consumed in consecutive
    slices, so no row is used twice within an epoch. A trailing partial batch
    is dropped, which matches ``iteration`` (the number of full batches per
    epoch).

    Parameters
    ----------
    minibatchsize : int
        Number of clips per batch.
    duration_sec : int or float
        Length, in seconds, every clip is cut or padded to.
    data_df : pandas.DataFrame
        Table with the ``raw_wave_path`` and ``label`` columns.
    label_map : dict, optional
        Mapping from label string to integer class id. When omitted, the
        module-level ``label2int`` dictionary is looked up at batch time.
    """

    def __init__(self,minibatchsize,duration_sec,data_df,label_map=None):
        super().__init__()
        self.duration_sec = duration_sec
        self.minibatchsize = minibatchsize
        self.data_df = data_df
        self.data_size = data_df.shape[0]
        # Number of full minibatches in one pass over the data (an int).
        self.iteration = self.data_size // minibatchsize
        # Sampling rate every clip is required to have.
        self.sr = 22050
        self.label_map = label_map
        self._start_epoch()

    def _start_epoch(self):
        """Reshuffle the rows and rewind the read position to the first row."""
        self._epoch_df = self.data_df.sample(frac=1).reset_index(drop=True)
        self.cur_index = 0

    def get_batch(self,minibatchsize,duration_sec):
        """Load the next ``minibatchsize`` clips of the current epoch.

        Starts a new (reshuffled) epoch first when fewer than
        ``minibatchsize`` unread rows remain.

        Returns
        -------
        tuple
            ``(inputs, outputs)`` where ``inputs`` has shape
            ``(minibatchsize, int(duration_sec * self.sr), 1)`` and
            ``outputs`` has shape ``(minibatchsize,)`` holding class ids.

        Raises
        ------
        ValueError
            If ``minibatchsize`` is not between 1 and the number of rows, or
            if a clip's sampling rate differs from ``self.sr``.
        """
        if not 0 < minibatchsize <= self.data_size:
            raise ValueError(
                "minibatchsize must be between 1 and the number of rows "
                "({}), got {}".format(self.data_size, minibatchsize))

        # Not enough unread rows left for a full batch: begin a new epoch.
        if self.cur_index + minibatchsize > self.data_size:
            self._start_epoch()

        label_map = self.label_map if self.label_map is not None else label2int
        n_samples = int(duration_sec*self.sr)

        inputs = np.zeros([minibatchsize,n_samples,1])
        outputs = np.zeros(minibatchsize)

        # Positional slice at the instance's own offset (the original read a
        # global `cur_index` and reshuffled the whole table on every call).
        batch_df = self._epoch_df.iloc[self.cur_index:self.cur_index+minibatchsize]

        # Fill by position inside the batch; the DataFrame labels start at
        # cur_index and would run past the end of `inputs`.
        for row_pos,(_,row) in enumerate(batch_df.iterrows()):
            # NOTE: pickle.load can execute arbitrary code; only load trusted files.
            with open(row["raw_wave_path"],mode="rb") as f:
                wave_info = pickle.load(f)
            wave_sr = wave_info["sampling_rate"]
            raw_wave = wave_info["raw_wave"]

            if wave_sr != self.sr:
                raise ValueError(
                    "unexpected sampling rate {} in {} (expected {})".format(
                        wave_sr, row["raw_wave_path"], self.sr))

            # Truncate to the target length, then zero-pad the tail if short.
            clipped = raw_wave[:n_samples]
            inputs[row_pos,:,0] = np.pad(clipped,[0,n_samples-len(clipped)],'constant')
            outputs[row_pos] = label_map[row["label"]]

        self.cur_index = self.cur_index + minibatchsize

        return (inputs,outputs)

    def next_train(self):
        """Yield ``(inputs, outputs)`` minibatches forever.

        The first batch starts a freshly shuffled epoch; ``get_batch`` starts
        further epochs whenever the remaining rows cannot fill a batch.
        """
        self._start_epoch()
        while True:
            yield self.get_batch(minibatchsize = self.minibatchsize,
                                 duration_sec = self.duration_sec)

#    def next_val(self):
#        while True:           
#            yield self.get_batch(minibatchsize = self.minibatchsize,
#                                 duration_sec = self.duration_sec)


In [162]:
data_gen = RawWaveGenerator(minibatchsize=32,duration_sec=5,data_df=train_info)

In [163]:
# Smoke test: pull batches from the generator and stop once i == 5,
# i.e. after six batches have been produced.
for i,v in enumerate(data_gen.next_train()):
    if i == 5:
        break

In [164]:
data_gen.cur_index

192