In [1]:
import numpy as np
import pandas as pd
from scipy import fftpack
from scipy import signal
import datetime
import pickle
import time
%matplotlib inline

In [2]:
p_path="/home/takeyama/pywork/ipython/2016-05-30/"

In [3]:
cd ~/Documents/SyncRecord/cleaning-addingLABEL/

/home/takeyama/Documents/SyncRecord/cleaning-addingLABEL


In [4]:
class SensorData:
    
    def __init__(self):
        print "__class__"
        # raw data
        self._RawData={}    
        # fft data
        self._FFTData={}
        # power spectol data
        self._PowerData={}    
        # flag exsist data
        self._Flag_exist_data=False
        
        self._columns=['AccX','AccY','AccZ','GyrX','GyrY','GyrZ']
        self._fft_col=['fft_AccX','fft_AccY','fft_AccZ','fft_GyrX','fft_GyrY','fft_GyrZ']
        self._power_col=['power_AccX','power_AccY','power_AccZ','power_GyrX','power_GyrY','power_GyrZ']
        
    def ImportCSV(self,Sclass,csv_file):
        self.ClassName=Sclass
        self.Flag_exist_data=True
        # design dataframe
        data = pd.read_csv(csv_file,encoding="SHIFT-JIS")
        data.columns=[u'Type',u'Time',u'AccX',u'AccY',u'AccZ',u'GyrX',u'GyrY',u'GyrZ']
        data.Time=pd.to_datetime(data.Time)
        data = pd.pivot_table(data,values=[u'AccX',u'AccY',u'AccZ',u'GyrX',u'GyrY',u'GyrZ'],index=[u'Time'])
        
        # convert numpy.darray
        AccX=data.AccX.values*0.0001
        AccY=data.AccY.values*0.0001
        AccZ=data.AccZ.values*0.0001
        GyrX=data.GyrX.values*0.01
        GyrY=data.GyrY.values*0.01
        GyrZ=data.GyrZ.values*0.01
        Time=data.index.to_pydatetime().astype('datetime64[ns]')
        
        # regist each raw data 
        self._RawData['AccX'] = AccX
        self._RawData['AccY'] = AccY
        self._RawData['AccZ'] = AccZ
        self._RawData['GyrX'] = GyrX
        self._RawData['GyrY'] = GyrY
        self._RawData['GyrZ'] = GyrZ
        self._RawData['Time'] = Time
    
    def ShowFlagExistData(self):
        return self.Flag_exist_data
    
    def GetColumns(self):
        return self._columns
    
    def GetTime(self):
        return self._RawData['Time']

    def ShowAllDf(self):
        print 'AccX : ';print self._RawData['AccX']
        print 'AccY : ';print self._RawData['AccY']
        print 'AccZ : ';print self._RawData['AccZ']
        print 'GyrX : ';print self._RawData['GyrX']
        print 'GyrY : ';print self._RawData['GyrY']
        print 'GyrZ : ';print self._RawData['GyrZ']
        
    def _Time2Num(self,time):
        return np.where(self._RawData['Time']==np.datetime64(time) )[0][0]
    
    def ShowQuery(self,Sname,rng=[]):
        data = self._RawData[Sname]
        print Sname+':'+str( data[rng[0]:rng[1]])
            
    def _sliding_window(self,Sname,samp, overlap):
        count =0
        s =self._RawData['Time'][0]
        start=self._Time2Num(s)
        g = s+np.timedelta64(samp*10,'ms')
        goal= self._Time2Num(g)
        yield self._RawData[Sname][start:goal]
        
        add=overlap*0.01
        
        while True:
            try:
                count +=1
                s =s+np.timedelta64(samp*10,'ms')
                start=self._Time2Num(s)
                g  =s+np.timedelta64(samp*10,'ms')
                goal= self._Time2Num(g)
                yield self._RawData[Sname][start:goal]
            except StopIteration:
                print '_sliding_window StopIteration'
                break   
            except IndexError:
                print '_sliding_window IndexError'
                break  
                
# Fast Frier transaction            
    def GetFFT(self,Sfft,samp):
        return np.load(p_path+'fft/'+self.ClassName+'_'+Sfft+'_'+str(samp)+'.npz')['arr_0']
    
    def CalcFFT(self,samp,overlap=0.5):
        start = time.time()
        fft_data = np.array([])
        
        for n,f in zip( self._columns,self._fft_col):
            print 'start'+n+'->'+f
            sw = self._sliding_window(n,samp,overlap)
            while True:
                try:
                    d=sw.next()
                    fft_data = np.append(fft_data, fftpack.fft(d)[1:(samp/2)+1] )   # fftの直流成分を除くsample/2の
                except StopIteration:
                    print 'CalcFFTStopIteration'
                    fft_data = fft_data.reshape(len(fft_data)/(samp/2),(samp/2) )
                    self._FFTData[f] = fft_data
                    np.savez(p_path+'fft/'+self.ClassName+'_'+str(f)+'_'+str(samp),self._FFTData[f])
                    break   
        elapsed_time = time.time() - start
        print ("elapsed_time:{0}".format(elapsed_time)) + "[sec]"
           

# Spectol Power 
    def GetPower(self,Spower,samp):
        return np.load(p_path+'power/'+self.ClassName+'_'+Spower+'_'+str(samp)+'.npz')['arr_0']
    
    def _power(self,fft_array):
        p=lambda x,y : np.sqrt(x**2+y**2)
        power_array=np.array([])
        for vector in fft_array:
            tmp = p( np.real(vector),np.imag(vector) )
            tmp = tmp/np.sum(tmp)
            power_array = np.append(power_array,tmp)
        return power_array
        
    def CalcPower(self,samp,overlap=0.5):
        start = time.time()
        for fft_name,power_name in zip( self._fft_col, self._power_col):
            print 'start'+fft_name+'->'+power_name
            fft_data = self.GetFFT(fft_name,samp)
            power_data=self._power(fft_data)
            np.savez(p_path+'power/'+self.ClassName+'_'+power_name+'_'+str(samp),power_data)
        elapsed_time = time.time() - start
        print ("elapsed_time:{0}".format(elapsed_time)) + "[sec]"

センサーのスタート時間を求める

In [5]:
data1 = SensorData()
data2 = SensorData()
data3 = SensorData()
data4 = SensorData()
data5 = SensorData()

__class__
__class__
__class__
__class__
__class__


In [6]:
data1.ImportCSV('left_hand','Conv-left-hand-ags.csv')
data2.ImportCSV('left_leg','Conv-left-leg-ags.csv')
data3.ImportCSV('right_hand','Conv-right-hand-ags.csv')
data4.ImportCSV('right_leg','Conv-right-leg-ags.csv')
data5.ImportCSV('west','Conv-west-ags.csv')

In [7]:
time1=data1.GetTime()
time2=data2.GetTime()
time3=data3.GetTime()
time4=data4.GetTime()
time5=data5.GetTime()

In [8]:
if min(time1) < min(time2):
    MAX = min(time2)
else:
    MAX = min(time1)

if MAX < min(time3):
    MAX = min(time3)

if MAX < min(time4):
    MAX = min(time4)

if MAX < min(time5):
    MAX = min(time5)

print MAX

2016-06-07T18:06:15.877000000


Conv-から始まるファイルはすでにhh:mm:ssSSのように変換されている。
しかし、この形では四捨五入といった丸めることができない。

よって、最初のmsの状態で位置の桁を四捨五入すれば簡単に丸めることができるのではないだろうか？
試しに、値をleft-hand.csvの時刻から四捨五入して、hh:mm:ssSSの形に直してみる。

In [9]:
np.array(55844456,dtype='datetime64[ms]')

array(datetime.datetime(1970, 1, 1, 15, 30, 44, 456000), dtype='datetime64[ms]')

In [10]:
np.datetime64(55844456,'ms')

numpy.datetime64('1970-01-01T15:30:44.456')

In [11]:
round(55844456*0.1)*10

55844460.0

In [12]:
a=round(55844456*0.1)*10
np.datetime64(int(a),'ms')

numpy.datetime64('1970-01-01T15:30:44.460')

In [13]:
np.datetime64(55844456,'ms')

numpy.datetime64('1970-01-01T15:30:44.456')

CSVファイルをnumpyで読み込んでクラスのインポートと同じような動作ができるかどうかやってみる。

In [15]:
data[0]

NameError: name 'data' is not defined

In [16]:
data.shape

NameError: name 'data' is not defined

In [17]:
time = np.genfromtxt("left-hand.csv",delimiter=",",usecols=(1)) #時間の列だけを抽出

In [18]:
func = lambda x: int(round(x*0.1)*10)
a= map( func, time) 

t = lambda x: np.datetime64(x,'ms')
times = map(t, a)
time = np.array(times,dtype=np.datetime64)

In [19]:
time[0]

numpy.datetime64('1970-01-01T18:06:15.700')

In [20]:
np.datetime64(55844456,'ms')

numpy.datetime64('1970-01-01T15:30:44.456')

前処理クラス　時系列の統一　アルゴリズム　考案　2016-06-07

現在の作業状況の途中経過をここにメモする。
まずは、肝心の時系列だがpandasで読み込むよりnumpy.loadcsv使ったほうが正確である
しかし、この読み込み方法だと問題が１つある。それは、加速度データがなぜかよみこまれない
からである。
一番怪しいのは、マイナスとか符号が原因でエラーが起きているのではないかと考える。


In [21]:
#sampling freqency 
samp=10

サンプリング周波数を決め、切り上げ、切り捨て、四捨五入を関数を使用しないで実装してみる
この計算では各データの単位はmsで行う。
理由として、csvファイルの時刻は当日０時から経過したmsであるから

In [22]:
time = np.genfromtxt("left-hand.csv",delimiter=",",usecols=(1)) #時間の列だけを抽出

In [23]:
time

array([ 65175696.,  65175706.,  65175716., ...,  65730376.,  65730386.,
        65730396.])

In [24]:
#切り捨て
output = (time/samp).astype(int)*samp
output

array([65175690, 65175700, 65175710, ..., 65730370, 65730380, 65730390])

In [25]:
#切り上げ
output = ((time+samp)/samp).astype(int)*samp
output

array([65175700, 65175710, 65175720, ..., 65730380, 65730390, 65730400])

In [26]:
#四捨五入
output = ( (time+samp/2)/samp).astype(int)*samp
output

array([65175700, 65175710, 65175720, ..., 65730380, 65730390, 65730400])

結局、最初のデータは時間を指すことはわかるが、２番めの数字は元データをみても該当するものはない。
よって、２通りのCSV読む方法を取る。１つはpandas、もう１つはnumpy.loadtxtである。
pandasのほうでは、加速度、角速度を読み込む
numpyのほうでは、時系列を読み込む

In [27]:
'''
csv_file -> ファイル名
mode Round -> 四捨五入
     Roundup -> 切り上げ
     Rounddown -> 切り捨て
'''
def ImportCSV(csv_file,samp,mode='Round'):
        # data dictionary 
        RawData={}   
        
        # design dataframe and import csv
        data = pd.read_csv(csv_file,encoding="SHIFT-JIS")
        data.columns=[u'Type',u'Time',u'AccX',u'AccY',u'AccZ',u'GyrX',u'GyrY',u'GyrZ']
        data.Time=pd.to_datetime(data.Time)
        data = pd.pivot_table(data,values=[u'AccX',u'AccY',u'AccZ',u'GyrX',u'GyrY',u'GyrZ'],index=[u'Time'])
        
        # convert numpy.darray 
        # Acc Data  [0.1mG]=>[G]
        # Gyr Data  [0.01dps]=>[dps]   ...dps=degree per second
        AccX=data.AccX.values*0.0001
        AccY=data.AccY.values*0.0001
        AccZ=data.AccZ.values*0.0001
        GyrX=data.GyrX.values*0.01
        GyrY=data.GyrY.values*0.01
        GyrZ=data.GyrZ.values*0.01
        
        # regist each raw data 
        RawData['AccX'] = AccX
        RawData['AccY'] = AccY
        RawData['AccZ'] = AccZ
        RawData['GyrX'] = GyrX
        RawData['GyrY'] = GyrY
        RawData['GyrZ'] = GyrZ
        
        # import time by using numpy
        time = np.genfromtxt(csv_file,delimiter=",",usecols=(1)) #時間の列だけを抽出       
        
        if mode == 'Roundup':
            func = lambda x: int(x/samp)*samp
        elif mode == 'Rounddown':
            func = lambda x: int(x/samp)*samp
        elif mode == 'Round':
            func = lambda x: int((x+samp/2)/samp)*samp
        #ERROR
        else:
            print 'check mode and inputed word is caused error'
            return -1
        
        output = map(func,time)
        RawData['Time'] = output
        return RawData
            

In [28]:
check1 = ImportCSV('left-hand.csv',10)

In [29]:
check1['Time']

[65175700,
 65175710,
 65175720,
 65175730,
 65175740,
 65175750,
 65175760,
 65175770,
 65175780,
 65175790,
 65175800,
 65175810,
 65175820,
 65175830,
 65175840,
 65175850,
 65175860,
 65175870,
 65175880,
 65175890,
 65175900,
 65175910,
 65175920,
 65175930,
 65175940,
 65175950,
 65175960,
 65175970,
 65175980,
 65175990,
 65176000,
 65176010,
 65176020,
 65176030,
 65176040,
 65176050,
 65176060,
 65176070,
 65176080,
 65176090,
 65176100,
 65176110,
 65176120,
 65176130,
 65176140,
 65176150,
 65176160,
 65176170,
 65176180,
 65176190,
 65176200,
 65176210,
 65176220,
 65176230,
 65176240,
 65176250,
 65176260,
 65176270,
 65176280,
 65176290,
 65176300,
 65176310,
 65176320,
 65176330,
 65176340,
 65176350,
 65176350,
 65176360,
 65176370,
 65176380,
 65176390,
 65176400,
 65176410,
 65176420,
 65176430,
 65176440,
 65176450,
 65176460,
 65176470,
 65176480,
 65176490,
 65176500,
 65176510,
 65176520,
 65176530,
 65176540,
 65176550,
 65176560,
 65176570,
 65176580,
 65176590,

この関数を使用することによって、切り捨て切り上げ、四捨五入を行い
時系列の時間を10msごとのデータになった ->1の桁はゼロである

次に、各データの時系列を同期する

In [30]:
check2 = ImportCSV('left-leg.csv',10)
check3 = ImportCSV('right-hand.csv',10)
check4 = ImportCSV('right-leg.csv',10)
check5 = ImportCSV('west.csv',10)

センサの時系列の最小値を求める。そして、それぞれの最小値を比較して
その中で一番大きい値をスタート時間とする

In [31]:
if min(check1['Time']) < min(check2['Time']):
    MAX = min(check2['Time'])
else:
    MAX = min(check1['Time'])

if MAX < min(check3['Time']):
    MAX = min(check3['Time'])

if MAX < min(check4['Time']):
    MAX = min(check4['Time'])

if MAX < min(check5['Time']):
    MAX = min(check5['Time'])

print MAX

65175880


センサの時系列の最大値を求める。そして、それぞれの最大値を比較して
その中で一番小さい値をスタート時間とする

In [32]:
if max(check1['Time']) < max(check2['Time']):
    MIN = max(check1['Time'])
else:
    MIN = max(check2['Time'])

if MIN > max(check3['Time']):
    MIN = max(check3['Time'])

if MIN > max(check4['Time']):
    MIN = max(check4['Time'])

if MIN > max(check5['Time']):
    MIN = max(check5['Time'])

print MIN

65730150


In [33]:
print max(check1['Time'])
print max(check2['Time'])
print max(check3['Time'])
print max(check4['Time'])
print max(check5['Time'])

65730400
65730970
65730150
65730810
65730620


各センサの時系列データをスタート時間を０番目となるようにする


In [34]:
for i in range(0, len(check1['Time'])):
    if 65175880 == check1['Time'][i]:
            print 'start index number is '+str(i)

start index number is 18


In [35]:
check1['Time'][18]

65175880

In [36]:
for i in range(0, len(check2['Time'])):
    if 65175880 == check2['Time'][i]:
            print 'start index number is '+str(i)

start index number is 0


In [43]:
check2['Time'][0]

65175880

In [37]:
for i in range(0, len(check3['Time'])):
    if 65175880 == check3['Time'][i]:
            print 'start index number is '+str(i)

start index number is 22


In [44]:
check3['Time'][22]

65175880

In [38]:
for i in range(0, len(check4['Time'])):
    if 65175880 == check4['Time'][i]:
            print 'start index number is '+str(i)

start index number is 8


In [45]:
check4['Time'][8]

65175880

In [39]:
for i in range(0, len(check5['Time'])):
    if 65175880 == check5['Time'][i]:
            print 'start index number is '+str(i)

start index number is 13


In [46]:
check5['Time'][13]

65175880

上記の方法で求めたスタート時間とゴール時間の間を扱う。
しかし、その区間にもセンサによっては取得できてない可能性もあるので確認する

手法としては、スタート時間からゴール時間をステップ+10とした連続値の配列を作成する
各センサデータの時系列はこの配列（以後、参照配列と呼称する）と比較してNANがあるかどうか調べる
NANがある場合、同じ大きさの配列のフラグにFalseを入れる。ない場合はTrueを入れる


In [40]:
ReferenceIndex = range(65175880,65730150,10)
Flag_ExistData = range(0, len(ReferenceIndex))

In [47]:
for i in range(0, len(ReferenceIndex)):
    if (check1['Time'][i+18]==check2['Time'][i]==check3['Time'][i+22]==check4['Time'][i+8]==check5['Time'][i+13]==ReferenceIndex[i]):
        #print str(i)+' is all sensor data complete'
        Flag_ExistData[i]=True
    else:
        print str(i)+' is data has nan data'
        Flag_ExistData[i]=False

5 is data has nan data
6 is data has nan data
7 is data has nan data
8 is data has nan data
9 is data has nan data
10 is data has nan data
11 is data has nan data
12 is data has nan data
13 is data has nan data
14 is data has nan data
15 is data has nan data
16 is data has nan data
17 is data has nan data
18 is data has nan data
19 is data has nan data
20 is data has nan data
21 is data has nan data
22 is data has nan data
23 is data has nan data
24 is data has nan data
25 is data has nan data
26 is data has nan data
27 is data has nan data
28 is data has nan data
29 is data has nan data
30 is data has nan data
31 is data has nan data
32 is data has nan data
33 is data has nan data
34 is data has nan data
35 is data has nan data
36 is data has nan data
37 is data has nan data
38 is data has nan data
39 is data has nan data
40 is data has nan data
41 is data has nan data
42 is data has nan data
43 is data has nan data
44 is data has nan data
45 is data has nan data
46 is data has nan da

In [42]:
for i in range(0, len(ReferenceIndex)):
    if (check1['Time'][i+18]==check2['Time'][i]):
        #print str(i)+' is all sensor data complete'
        Flag_ExistNAN[i]=True
    else:
        print str(i)+' is data has nan data'
        Flag_ExistNAN[i]=False

NameError: name 'Flag_ExistNAN' is not defined