In [35]:
import scipy.io
import os
import glob 
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

# Glob pattern matching the recording files; adjust it to the local dataset location.
path = "dataset/*"

# Default size for every matplotlib figure created in this notebook.
plt.rcParams.update({'figure.figsize': [20, 10]})


# Notes

### short description of each column in the file, given minor investigation
    
    - '__header__' irrelevant and should be dropped
    - '__version__' irrelevant and should be dropped
    - '__globals__' unsure, might be variables in the .mat file
    - 'handle_arrow_rand' randomness array? just an array of 1s and 2s 
    - 'no_movements' holds array with integer 30
    - 'time_cue_on' formatted as an array of floats -> it's a date and needs reformatting // 30 timestamps
    - 'time_cue_off' formatted as an array of floats -> it's a date and needs reformatting // 1219 timestamps
    - 'TriggerPoint' holds 61 timestamps
    - 'delay_T1' holds a constant 3 
    - 'delay_random_T1' holds a constant 2
    - 'delay_T2' holds a constant 0.1
    - 'sample_rate' likely Hz // holds constant 1200
    - 'time_window' holds a constant 3
    - 'no_time_windows' holds a constant 1000
    - 'filter_code_eeg' holds a constant 38 (could refer to what type of filter e.g. butterworth?)
    - 'time_start_device1' holds timestamp for when recording began
    - 'time_after_first_window' holds timestamp
    - 'time_after_last_window' holds timestamp
    - 'time_stop_device1' holds timestamp of when recording ended
    - 'data_device1' dataframe of 16 columns (likely 16 channels?, however mail stated 12 channels + 13th being EMG) size 405824x16 for cue_set1 and 446912x16 for cue_set2
    - 'time_axis_all_device1' potentially holds timestamps from recording start? has 1x402224 and 1x443312 shape for cue_set1 and cue_set2 respectively. (number of indices almost line up with data_device1)


### Load data

In [36]:
# Load every file matched by `path` into a list of loadmat dictionaries.
# glob returns its matches in arbitrary, filesystem-dependent order, so the
# paths are sorted first: later cells pick recordings by position
# (cue_sets[0], cue_sets[1]) and must get the same file on every machine/run.
# NOTE(review): recursive=True only has an effect when `path` contains "**".
cue_sets = []

for file in sorted(glob.glob(path, recursive=True)):
    cue_sets.append(scipy.io.loadmat(file))

In [37]:
# Give the first two loaded recordings their own names for the cells that follow.
cue_set_01, cue_set_02 = cue_sets[0], cue_sets[1]

In [38]:
# Walk the loaded recordings and print a numbered banner for each one.
counter = 1
for cue_set in cue_sets:
    print(f' # # - cue_set_{counter} - # # ')
    counter += 1

    # Snapshot this recording's keys and the values stored under them. Each
    # pass overwrites the previous snapshot, so only the last recording's
    # keys/values remain once the loop is done.
    keys = cue_set.keys()
    data = [cue_set[key] for key in keys]


 # # - cue_set_1 - # # 
 # # - cue_set_2 - # # 


In [39]:
# Wrap the 'data_device1' array of the first recording in a DataFrame
# (default integer row and column labels).
data_pd = pd.DataFrame(data=cue_set_01['data_device1'])

# Bare last expression so the notebook renders the table preview.
data_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.000258,0.000272,0.000064,0.000609,0.000055,0.000257,-0.000007,-0.000099,-0.000308,0.000339,0.000280,0.000032,0.000079,0.000185,0.000011,0.000299
1,0.052644,0.007779,-0.013178,-0.011900,-0.009890,0.004895,-0.011826,-0.014831,-0.051013,0.052730,0.143087,0.052914,-0.034878,-0.053215,-0.020476,0.021011
2,-0.018937,-0.006850,-0.014828,-0.012735,-0.016893,-0.011796,-0.016083,-0.008713,-0.001338,-0.008554,0.177096,0.185288,0.006653,-0.047167,-0.014965,-0.040797
3,-0.017449,-0.009423,-0.020752,-0.019010,-0.024620,-0.015835,-0.023497,-0.013807,-0.008897,-0.002519,0.246563,0.245063,-0.026257,-0.035837,-0.030295,-0.040202
4,-0.019193,-0.009692,-0.021968,-0.019929,-0.025658,-0.016586,-0.024548,-0.014334,-0.008702,-0.003762,0.260248,0.259745,-0.043538,-0.024397,-0.029768,-0.046494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
405819,-0.017999,-0.009966,-0.018244,-0.020071,-0.026619,-0.016393,-0.022933,-0.012371,-0.009561,-0.004790,0.247545,0.247177,0.058031,-0.138931,0.000381,-0.088700
405820,-0.018010,-0.009973,-0.018249,-0.020080,-0.026629,-0.016405,-0.022943,-0.012383,-0.009570,-0.004801,0.247942,0.247574,0.048608,-0.129188,-0.002052,-0.085681
405821,-0.018020,-0.009980,-0.018257,-0.020077,-0.026634,-0.016416,-0.022952,-0.012391,-0.009573,-0.004793,0.249757,0.249391,0.036568,-0.112828,-0.007677,-0.077072
405822,-0.018037,-0.009989,-0.018272,-0.020077,-0.026638,-0.016420,-0.022953,-0.012395,-0.009573,-0.004794,0.251253,0.250888,0.019876,-0.093358,-0.013501,-0.069212


### Investigating the data

Plotting everything at once is obviously impossible to analyse and gauge. 

In [40]:
# DataFrame.plot() opens a figure of its own unless it is handed an Axes, so a
# bare plt.figure() beforehand only leaves an extra, empty figure in the
# output. Create figure and axes together and draw onto them instead.
fig, ax = plt.subplots()
data_pd.iloc[0:100].plot(ax=ax, title='All channels, rows 0-99')

<AxesSubplot:>

However, taking a random subset of the data shows that the majority of channels are unused. Experiment further with this to validate the findings. 

Channels 0, 1, 12, 13, 14 and 15 seem to be the only relevant ones. (Channel 0 follows channel 1 almost perfectly, so it could be redundant.)

It was stated in the e-mail that the 13th channel was EMG; however, that doesn't really seem to add up here, as these look like regular EEG values. 


In [25]:
# NOTE(review): this plots the same rows (0-99) as the overview plot earlier in
# the notebook; consider dropping this cell or pointing it at another slice.
# DataFrame.plot() opens a figure of its own unless it is handed an Axes, so a
# bare plt.figure() beforehand only leaves an extra, empty figure in the
# output. Create figure and axes together and draw onto them instead.
fig, ax = plt.subplots()
data_pd.iloc[0:100].plot(ax=ax, title='All channels, rows 0-99')

<AxesSubplot:>

In [26]:
# A second 100-row slice, taken further into the recording.
# DataFrame.plot() opens a figure of its own unless it is handed an Axes, so a
# bare plt.figure() beforehand only leaves an extra, empty figure in the
# output. Create figure and axes together and draw onto them instead.
fig, ax = plt.subplots()
data_pd.iloc[100000:100100].plot(ax=ax, title='All channels, rows 100000-100099')

<AxesSubplot:>

## Finding a cue-on time


Trying to find the timeframe in which a cue appears in the dataset, based on the given timestamps.

In [27]:
# The notes list the sample rate as 1200 Hz, so dividing the number of rows by
# the sample rate gives the length of the recording in seconds.
frequency = cue_set_01['sample_rate'][0, 0]  # stored as a nested array; take its single first entry

sec_recoding = len(data_pd) / frequency

sec_recoding

338.18666666666667

In [28]:
# Pull the cue on/off timestamp tables and the recording start out of the file.
cue_on = cue_set_01['time_cue_on']
cue_off = cue_set_01['time_cue_off']
time_start = cue_set_01['time_start_device1']

# Element-wise offset of the first cue-on timestamp from the recording start.
cue_01 = cue_on[0] - time_start
cue_01

array([[ 0.   ,  0.   ,  0.   ,  0.   ,  0.   , 23.996]])

In [29]:
# Element-wise offset of the first cue-off timestamp from the recording start.
cue_01_off = cue_off[0] - time_start
cue_01_off

array([[ 0.   ,  0.   ,  0.   ,  0.   ,  0.   , 28.646]])

Based on these two values, we try to crop the data to this timeframe.

In [30]:
def _elapsed_seconds(offset):
    """Collapse a timestamp difference into a single number of seconds.

    Reads ``offset[0][2:6]`` as (days, hours, minutes, seconds) and folds the
    four fields into seconds.

    NOTE(review): assumes the timestamps are 6-field
    [year, month, day, hour, minute, second] vectors - confirm against the
    recording script. Year and month differences are ignored.
    """
    days, hours, minutes, seconds = offset[0][2:6]
    return ((days * 24 + hours) * 60 + minutes) * 60 + seconds


# Convert the cue-on / cue-off offsets into row indices of the sample matrix.
# Reading only the last (seconds) field is correct only while the cue and the
# recording start fall within the same minute; folding in the coarser fields
# keeps the index right for later cues and gives the same result whenever
# those fields are all zero.
start = int(_elapsed_seconds(cue_01) * frequency)
end = int(_elapsed_seconds(cue_01_off) * frequency)

In [31]:
# All channels over the rows between `start` and `end`.
# DataFrame.plot() opens a figure of its own unless it is handed an Axes, so a
# bare plt.figure() beforehand only leaves an extra, empty figure in the
# output. Create figure and axes together and draw onto them instead.
fig, ax = plt.subplots()
data_pd.iloc[start:end].plot(ax=ax, title=f'All channels, rows {start}-{end}')

<AxesSubplot:>

It is very difficult to see anything; instead, we break it down into individual channels.

In [32]:
# One figure per channel for the rows between `start` and `end`.
# Each figure is shown and closed before the next one is created so open
# figures do not pile up across the loop (presumably the source of the
# open-figure warning left in this cell's output).
# To stretch the traces horizontally, pass a wider figsize to plt.subplots(),
# e.g. figsize=(300, 10).
for column in data_pd:
    fig, ax = plt.subplots()
    data_pd[column].iloc[start:end].plot(ax=ax, title=f'Channel: {column}')
    plt.show()
    plt.close(fig)


  plt.figure()


In [33]:
# TODO: find the average time between cue_on and cue_off to see how long each cue window lasts.

In [34]:
# Zoomed version: 100 rows centred on the midpoint between `start` and `end`,
# one figure per channel.
half_width = 50
middle = (start + end) // 2
# Clamp at 0: a negative slice start would be counted from the end of the frame.
zoom_from = max(middle - half_width, 0)
zoom_to = middle + half_width

for column in data_pd:
    fig, ax = plt.subplots()
    data_pd[column].iloc[zoom_from:zoom_to].plot(ax=ax, title=f'Channel: {column}')
    # Show and close each figure so open figures do not accumulate in the loop.
    plt.show()
    plt.close(fig)