In [39]:
import numpy as np 
import pandas as pd
from os.path import dirname, join as pjoin
import scipy.signal

# Data cleaning/preprocessing

In [40]:
# This a class that will store the emg data of each segments as well their label (skill level)
class segment():
    def __init__(self,emg,level,subject_number,run_number,segment_number):
        self.emg = emg #an np.array of shape 16 X nb_sample (16 is the number of emg channel)
        self.level = level #skill level/label of the subject (novice,intermediat or expert)
        self.subject = subject_number #number of the subject (from 1 to 12 )
        self.run = run_number #number of the run,max 7(some intermdiate numbers may not be present)
        self.segment = segment_number
        self.nb_samples = emg.shape[1]
        self.nb_channel = emg.shape[0]
        self.nb_features = 14
        #self.features_channel = np.empty((self.nb_channel,1))  #an np array of shape 16 X nb_features 
        #self.features = np.empty(0) #an np array of shape nb_features (all the channel are combined)
    
    def compute_features(self):
        
        features_list = np.empty((self.nb_channel,self.nb_features))
        
        for i in range(self.nb_channel):
    
            features_channel = np.empty(0)
            iemg,mav,ssi,rms = self.iemg_mav_ssi_rms()
            features_channel = np.append(features_channel,iemg[0][i])
            features_channel = np.append(features_channel,mav[0][i])
            features_channel = np.append(features_channel,ssi[0][i])
            features_channel = np.append(features_channel,rms[0][i])
            features_channel = np.append(features_channel,self.var()[0][i])
            features_channel = np.append(features_channel,self.ssc()[0][i])
            features_channel = np.append(features_channel,self.zc()[0][i])
            features_channel = np.append(features_channel,self.signal_range()[0][i])
            features_channel = np.append(features_channel,self.wamp()[0][i])
            features_channel = np.append(features_channel,self.wl()[0][i])
            #---------------------------------------------------------------
            fmn,fmd,psmn,psmd = self.fmn_fmd_psmn_psmd()
            features_channel = np.append(features_channel,fmn[0][i])
            features_channel = np.append(features_channel,fmd[0][i])
            features_channel = np.append(features_channel,psmn[0][i])
            features_channel = np.append(features_channel,psmd[0][i])
            
            features_list[i] = features_channel
        
        
        return features_list #is not normalised        
    
    def iemg_mav_ssi_rms(self):
        #print("1")
        emg= self.emg
        emg = np.absolute(emg)
        nb_channel = self.nb_channel
        n = self.nb_samples
        iemg=np.empty((1,nb_channel))
        ssi=np.empty((1,nb_channel))
        mav=np.empty((1,nb_channel))
        rms=np.empty((1,nb_channel))

        for i in range(nb_channel):
            iemg[0][i]= np.sum(emg[i])
            ssi[0][i] = np.sum(emg[i]**2)
        if n !=0 : 
            mav = (1/n)*iemg
            rms = ((1/n)*ssi)**(0.5)
        if n==0: 
            print("No samples")
            mav=np.zeros((1,nb_channel))
            rms=np.zeros((1,nb_channel))
        return iemg,mav,ssi,rms
    
    def var(self):
        #print("var")
        emg=self.emg
        nb_channel = self.nb_channel
        var = np.empty((1,nb_channel))
        for i in range(0,nb_channel):
            var[0][i]=np.var(emg[i])
        return var
    
    def ssc(self, x_thresh = 0.001): #threshold TBD
        #print("ssc")
        emg=self.emg
        nb_channel = self.nb_channel
        ssc = np.empty((1,nb_channel))
         
        for i in range (nb_channel):
            n= emg[i].shape[0] #number of samples
            f_i = 0  
            for j in range(1,n-1):
                cond_1 = emg[i][j]>emg[i][j-1] and emg[i][j]>emg[i][j+1]
                cond_2 = emg[i][j]<emg[i][j-1] and emg[i][j]<emg[i][j+1]
                cond_3 = abs(emg[i][j]- emg[i][j+1])> x_thresh
                cond_4 = abs(emg[i][j]- emg[i][j-1])> x_thresh
                if cond_1 or cond_2 or cond_3 or cond_4:
                    f_i += 1
            if n !=0 :
                ssc[0][i] = (1/n)*f_i
            if n==0:
                ssc[0][i] = 0 
        return ssc
    
    def zc(self, x_thresh = 0.001): #zero crossings
        #print("zc")
        emg=self.emg
        nb_channel = self.nb_channel
        zc = np.empty((1,nb_channel))
        
        
        for i in range(nb_channel):
            f_i = 0 
            n= emg[i].shape[0] #number of samples 
            for j in range(n-1):
                cond_1 = emg[i][j]*emg[i][j+1]>0
                cond_2 = abs(emg[i][j] - emg[i][j+1])>x_thresh
                if cond_1 and cond_2 :
                    f_i += 1
                if n !=0 :
                    zc[0][i] = (1/n)*f_i
                if n==0: 
                    zc[0][i]
        return zc
    
    def signal_range(self):
        #print("sr")
        emg=self.emg
        nb_channel = self.nb_channel
        signal_range = np.empty((1,nb_channel))
        for i in range(nb_channel):
            n= emg[i].shape[0]
            if n !=0:
                maxi = emg[i].max()
                mini = emg[i].min()
                signal_range[0][i]=maxi-mini  
            if n==0: 
                signal_range[0][i]=0
        return signal_range
    
    def wamp(self,t=0.001):
        #print("wamp")
        emg=self.emg
        nb_channel = self.nb_channel
        wamp = np.empty((1,nb_channel))
        
        for i in range(nb_channel):
            n= emg[i].shape[0] #number of samples
            total = 0
            for j in range(1,n-1):
                if (abs(emg[i][j]-emg[i][j+1])>t):
                    total +=1
            if n!=0: 
                wamp[0][i] = (1/n)*total
            if n==0:
                wamp[0][i] = 0
        return wamp
    
    def wl(self): #waveform length 
        #print("wl")
        emg=self.emg
        nb_channel = self.nb_channel
        wl= np.empty((1,nb_channel))
        
        for i in range(nb_channel):
            n= emg[i].shape[0] #number of samples
            total = 0 
            for j in range(1,n-1):
                total += abs(emg[i][j]-emg[i][j-1])
            wl[0][i]= total
        return wl
    
    def fmn_fmd_psmn_psmd(self):
        #print("2")
        emg=self.emg
        nb_channel = self.nb_channel
        fmn= np.empty((1,nb_channel))
        fmd= np.empty((1,nb_channel))
        psmn= np.empty((1,nb_channel)) #mean of the psd 
        psmd= np.empty((1,nb_channel)) #median of the psd 
        for i in range(nb_channel):
            frequency,psd = scipy.signal.welch(emg[i], fs=0.2, scaling='density')
            fmn[0][i] = np.mean(frequency)
            fmd[0][i] = np.median(frequency)
            psmn[0][i] = np.mean(psd)
            psmd[0][i] = np.median(psd)
        return fmn,fmd,psmn,psmd
        

In [41]:
segments = np.empty(0)#this is the list that'll contain all the segments from the dataset


### Getting the data from the csv files :

In [42]:
data_dir = pjoin('dataset', 'subject_1', 'run_1')
file_name_e1 = pjoin(data_dir, 'segment_nb_510_emg.csv')

In [43]:
segments = np.empty(0)#this is the list that'll contain all the segments from the dataset


for i in range(1,12): #subject number form 1 to 12 
    data_dir = pjoin('dataset', f'subject_{i}')
    
    if i in [1,2,5,6,8,11]:
        group = 'intermediate'
    elif i in [7,9]:
        group ='novice'
    else:
        group='expert'
                    
    for j in range(1,8): #run/trial number (max 7 run/trial per subject)
        run_path = pjoin(data_dir , f'run_{j}')

        #--------------- normalization : --------------

        max_value = 0.0 #used for data normalization , the raw data will be divided by the maximum value of each trial/run 
        norma_const = 1.0
        for k in [560]: #to check the maxium across all the segments of the run (only 560 as it's all the segments toghether)
            try:
                seg_path = pjoin(run_path, f'segment_nb_{k}_emg.csv')
                data = pd.read_csv(seg_path)
                data.drop(columns=['index_global', 'index_buffer','absolute_time','relative_time'],axis=1,inplace=True)
                data = data.apply (pd.to_numeric, errors='coerce') #convert any non numeric value to Nan
                data.dropna(inplace=True, axis=0,how='any') #remove any row taht has a NaN
                data = data[(data != 0).all(axis=1)] #remove the rows with 0  in one of the emg chanell
                data = data[(data < 3e3).all(axis=1)] #remove the rows with very high value(artifacts) in one of the emg chanell
                data.reset_index(drop=True)

                
                emg_list = np.empty((16,len(data)),dtype=np.float64)
                
                for l in range(15): #emg chanels 
                    emg_list[l] = data[f"emg{l}"]
                    if emg_list[l].max() > max_value :
                        max_value = emg_list[l].max()
                 

            except FileNotFoundError:
                continue

        print("Maximum value for this trial is : ", max_value , "subject : " , i ,"trial ",j )
        if max_value !=0: 
            norma_const = 1/max_value
            
        #--------------- end of normalization code  ----------------------

        for k in [500,510,540,541,560]: # segment number
            try:
                seg_path = pjoin(run_path, f'segment_nb_{k}_emg.csv')
                data = pd.read_csv(seg_path)
                data.drop(columns=['index_global', 'index_buffer','absolute_time','relative_time'],axis=1,inplace=True)
                data = data.apply (pd.to_numeric, errors='coerce') #convert any non numeric value to Nan
                data.dropna(inplace=True, axis=0,how='any') #remove any row taht has a NaN
                data = data[(data != 0).all(axis=1)] #remove the rows with 0  in one of the emg chanell
                data = data[(data < 3e3).all(axis=1)] #remove the rows with very high value(artifacts) in one of the emg chanell
                data.reset_index(drop=True)

                
                emg_list = np.empty((16,len(data)),dtype=np.float64)
                
                for l in range(15): #emg chanels 
                    emg_list[l] = data[f"emg{l}"]
                    emg_list[l] = emg_list[l]*norma_const #we normalize with the value found earlier
                
                seg = segment(emg=emg_list,level=group,subject_number=i,run_number=j,segment_number=k)
                segments= np.append(segments,seg) #we add the segments to the gloabl list 

            except FileNotFoundError:
                continue


Maximum value for this trial is :  815.759758293405 subject :  1 trial  1
Maximum value for this trial is :  512.9144159265928 subject :  1 trial  2
Maximum value for this trial is :  811.0802535070852 subject :  1 trial  3
Maximum value for this trial is :  1119.0120141198968 subject :  1 trial  4
Maximum value for this trial is :  755.5365662607705 subject :  1 trial  5
Maximum value for this trial is :  0.0 subject :  1 trial  6
Maximum value for this trial is :  0.0 subject :  1 trial  7
Maximum value for this trial is :  1295.1038137964008 subject :  2 trial  1
Maximum value for this trial is :  0.0 subject :  2 trial  2
Maximum value for this trial is :  948.6170028789126 subject :  2 trial  3
Maximum value for this trial is :  1030.1014231798254 subject :  2 trial  4
Maximum value for this trial is :  0.0 subject :  2 trial  5
Maximum value for this trial is :  1178.3196508682515 subject :  2 trial  6
Maximum value for this trial is :  900.3977579068372 subject :  2 trial  7
Max

In [44]:
seg_test = segments[0]
print(seg_test.emg.shape)
print(type(seg_test))

(16, 9017)
<class '__main__.segment'>


In [45]:
"""print(seg_test.subject)
print(seg_test.run)
print(seg_test.segment)
print(seg_test.emg.shape)"""
#seg_test.mean() 10356
k = seg_test.compute_features()
print(k.shape)

(16, 14)


In [48]:
print(segments.shape[0]/5)

45.2


In [49]:
#this cell separets the segments in different lists depending on their types (500,510,etc...)
segment_diff = {}
for segment in segments:
    segment_number = segment.segment
    if segment_number in segment_diff:
        segment_diff[segment_number] = np.vstack((segment_diff[segment_number], segment))
    else:
        segment_diff[segment_number] = segment


"""
for i in [500,510,520,530,540,541,542,550,560]:
    try:
        exec(f'segments_{i} = segment_diff[{i}]')
    except KeyError:
        print("There is no ",i,"segments")
        continue
"""

'\nfor i in [500,510,520,530,540,541,542,550,560]:\n    try:\n        exec(f\'segments_{i} = segment_diff[{i}]\')\n    except KeyError:\n        print("There is no ",i,"segments")\n        continue\n'

In [52]:
print(segment_diff[540].shape)

(46, 1)


In [53]:
nb_channel = 16 
nb_features = 14 
features_500 = np.empty((len(segment_diff[500]),nb_channel, nb_features))
features_510 = np.empty((len(segment_diff[510]),nb_channel, nb_features))
#features_530 = np.empty((len(segment_diff[530]),nb_channel, nb_features))
features_540 = np.empty((len(segment_diff[540]),nb_channel, nb_features))
features_541 = np.empty((len(segment_diff[541]),nb_channel, nb_features))
#features_542 = np.empty((len(segment_diff[542]),nb_channel, nb_features))
#features_550 = np.empty((len(segment_diff[550]),nb_channel, nb_features))
features_560 = np.empty((len(segment_diff[560]),nb_channel, nb_features))

for i in range(len(segment_diff[500])):
    features_500[i]=segment_diff[500][i][0].compute_features()
print(500)
for i in range(len(segment_diff[510])):
    features_510[i]=segment_diff[510][i][0].compute_features()
print(510)



for i in range(len(segment_diff[540])):
    features_540[i]=segment_diff[540][i][0].compute_features()
print(540)

for i in range(len(segment_diff[541])):
    features_541[i]=segment_diff[541][i][0].compute_features()
print(541)


for i in range(len(segment_diff[560])):
    features_560[i]=segment_diff[560][i][0].compute_features()
print(560)


500


  ssi[0][i] = np.sum(emg[i]**2)
  x = um.multiply(x, x, out=x)
  result = np.conjugate(result) * result
  result = np.conjugate(result) * result
  result *= scale
  result[..., 1:-1] *= 2


510


  cond_1 = emg[i][j]*emg[i][j+1]>0


No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


540




No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples
No samples




541
560


In [54]:
print(features_500[0])

[[2.37032672e+02 2.62873098e-02 1.47547254e+01 4.04514907e-02
  1.62431974e-03 9.94898525e-01 7.32061661e-01 1.01459035e+00
  9.30908284e-01 1.52398429e+02 5.00000000e-02 5.00000000e-02
  1.57016094e-02 2.69040426e-03]
 [1.39910213e+02 1.55162707e-02 2.84428383e+00 1.77605099e-02
  1.08790225e-04 9.83586559e-01 7.76089609e-01 9.39019828e-02
  8.43850505e-01 4.91394189e+01 5.00000000e-02 5.00000000e-02
  1.07296787e-03 3.01970512e-04]
 [1.34576381e+02 1.49247401e-02 2.53486161e+00 1.67666420e-02
  7.80591064e-05 9.70389265e-01 7.51136742e-01 1.56628008e-01
  7.89065099e-01 3.76801347e+01 5.00000000e-02 5.00000000e-02
  7.39032921e-04 2.28351881e-04]
 [1.41421998e+02 1.56839301e-02 4.93740767e+00 2.34001402e-02
  4.79121166e-04 9.85028280e-01 7.32172563e-01 4.99189425e-01
  8.73350338e-01 6.98181818e+01 5.00000000e-02 5.00000000e-02
  4.90163526e-03 7.01928803e-04]
 [1.81483726e+02 2.01268411e-02 7.19495942e+00 2.82477031e-02
  7.80745725e-04 9.95120328e-01 7.10435843e-01 3.30215738e-01


### Writting back the data

In [55]:
print(len(features_510))
print(features_510.shape[0])

features_test = np.empty((8,nb_channel, nb_features))
print(len(features_test))
for i in range(9):
    features_test[i]=i
    print(i)

47
47
8
0
1
2
3
4
5
6
7


IndexError: index 8 is out of bounds for axis 0 with size 8

In [56]:
import os  
os.makedirs('data_labels_norma/', exist_ok=True)

In [57]:
label_500 = np.empty((len(segment_diff[500]),1),dtype=str)
label_510 = np.empty((len(segment_diff[510]),1),dtype=str)
#label_530 = np.empty((len(segment_diff[530]),1),dtype=str)
label_540 = np.empty((len(segment_diff[540]),1),dtype=str)
label_541 = np.empty((len(segment_diff[541]),1),dtype=str)
#label_542 = np.empty((len(segment_diff[542]),1),dtype=str)
#label_550 = np.empty((len(segment_diff[550]),1),dtype=str)
label_560 = np.empty((len(segment_diff[560]),1),dtype=str)

In [58]:
from sklearn import preprocessing

del_index=[]
for i in range(len(features_500)):
    try :
        features_500[i] = preprocessing.normalize(features_500[i], axis=0, copy=False, return_norm=False)
    except ValueError:
        print("Value Error")
        del_index.append(i)

for i in range(label_500.shape[0]):
    label_500[i][0]=segment_diff[500][i][0].level
    
label_500 = np.delete(label_500,del_index, axis=0)

df = pd.DataFrame(label_500)
print("del", del_index)
#df=df.drop(index=del_index)
#df.reset_index(drop=True)
df.to_csv(f'data_labels_norma/label_500.csv',index=False)

features_500 = np.delete(features_500,del_index, axis=0) 

del_index=[]
for i in range(len(features_510)):
    try :
        features_510[i] = preprocessing.normalize(features_510[i], axis=0, copy=False, return_norm=False)
    except ValueError:
        print("Value Error")
        del_index.append(i)
        
for i in range(label_510.shape[0]):
    label_510[i][0]=segment_diff[510][i][0].level
    
label_510 = np.delete(label_510,del_index, axis=0)

df = pd.DataFrame(label_510)
print("del", del_index)
#df=df.drop(index=del_index)
#df.reset_index(drop=True)
df.to_csv(f'data_labels_norma/label_510.csv',index=False)

features_510 = np.delete(features_510,del_index, axis=0) 


del_index=[]
for i in range(len(features_540)):
    try :
        features_540[i] = preprocessing.normalize(features_540[i], axis=0, copy=False, return_norm=False)
    except ValueError:
        print("Value Error")
        del_index.append(i)
        
for i in range(label_540.shape[0]):
    label_540[i][0]=segment_diff[540][i][0].level
    
label_540 = np.delete(label_540,del_index,axis=0)
df = pd.DataFrame(label_540)
print("del", del_index)
#df=df.drop(index=del_index)
#df.reset_index(drop=True)
df.to_csv(f'data_labels_norma/label_540.csv',index=False)

features_540 = np.delete(features_540,del_index, axis=0)  

del_index=[]        
for i in range(len(features_541)):
    try :
        features_541[i] = preprocessing.normalize(features_541[i], axis=0, copy=False, return_norm=False)    
    except ValueError:
        print("Value Error")
        del_index.append(i)
        
for i in range(label_541.shape[0]):
    label_541[i][0]=segment_diff[541][i][0].level
label_541 = np.delete(label_541,del_index,axis=0)
df = pd.DataFrame(label_541)
#df=df.drop(index=del_index)
print("del 41", del_index)
#df.reset_index(drop=True)
df.to_csv(f'data_labels_norma/label_541.csv',index=False) 

features_541 = np.delete(features_541,del_index, axis=0) 




del_index=[]        
for i in range(len(features_560)):
    try :
        features_560[i] = preprocessing.normalize(features_560[i], axis=0, copy=False, return_norm=False)
    except ValueError:
        print("Value Error")
        del_index.append(i)

for i in range(label_560.shape[0]):
    label_560[i][0]=segment_diff[560][i][0].level

label_560 = np.delete(label_560,del_index,axis=0)
df = pd.DataFrame(label_560)
print("del", del_index)
#df=df.drop(index=del_index)
#df.reset_index(drop=True)
df.to_csv(f'data_labels_norma/label_560.csv',index=False)        
        
features_560 = np.delete(features_560,del_index, axis=0)

del []
Value Error
del [34]
Value Error
Value Error
del [1, 11]
Value Error
Value Error
del 41 [11, 27]
Value Error
Value Error
del [15, 32]


In [59]:

os.makedirs('data_features_norma/run_500', exist_ok=True) 
for i in range(features_500.shape[0]):
    df = pd.DataFrame(features_500[i], index = range(16),columns = range(14))
    df.to_csv(f'data_features_norma/run_500/run_500_{i}.csv',index=False)  
    
os.makedirs('data_features_norma/run_510', exist_ok=True) 
for i in range(features_510.shape[0]):
    df = pd.DataFrame(features_510[i], index = range(16),columns = range(14))
    df.to_csv(f'data_features_norma/run_510/run_510_{i}.csv',index=False)
    
    
os.makedirs('data_features_norma/run_540', exist_ok=True) 
for i in range(features_540.shape[0]):
    df = pd.DataFrame(features_540[i], index = range(16),columns = range(14))
    df.to_csv(f'data_features_norma/run_540/run_540_{i}.csv',index=False)
    
os.makedirs('data_features_norma/run_541', exist_ok=True) 
for i in range(features_541.shape[0]):
    df = pd.DataFrame(features_541[i], index = range(16),columns = range(14))
    df.to_csv(f'data_features_norma/run_541/run_541_{i}.csv',index=False)
    
os.makedirs('data_features_norma/run_560', exist_ok=True) 
for i in range(features_560.shape[0]):
    df = pd.DataFrame(features_560[i], index = range(16),columns = range(14))
    df.to_csv(f'data_features_norma/run_560/run_560_{i}.csv',index=False)

In [60]:
test = np.array([1,2,3])
print(test)
ok =[]
test=np.delete(test,ok, axis=0)
print(test)
print(features_510.shape[0])
print(len(features_510))

[1 2 3]
[1 2 3]
46
46


In [61]:
print(segment_diff[500].shape)
print(segments[0].level)

(45, 1)
intermediate
