# Notebook initial pipeline and signal processing 

## initial thought process on steps to follow with pulling in data
- pull in metadata and convert it from EDF to a pandas DataFrame:
    - ``|user_id|device_id|disease|personal_information|``
- access respective user_id data folder
- pull in data from the respective device_id EDF data file
    - ``|user_id|device_id|type|time|value|``
- convert EDF to pandas dataframe
- merge required files together (for respective calculation)

In [2]:
import sys
import getpass
from pathlib import Path

import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt

import pyedflib

In [44]:
# build the sample-file directory from the OS login name of whoever runs this
user = getpass.getuser()
data_path = '/Users/' + user + '/Desktop/GitHub/fydp-data-science/sample_files/'

# metadata file does not exist yet -- either build our own or obtain it from ONDRI
metadata = '{}metadata.edf'.format(data_path)

In [53]:
# participant to explore; note this rebinds `user`, which previously held the
# OS login name used to build data_path
user = 'user_001'
user_folder = ''.join([data_path, user])

In [47]:
def convert_data_file(data_path, filename):
    """Read every signal of an EDF file into a pandas DataFrame.

    Parameters
    ----------
    data_path : str
        Directory prefix. It is concatenated directly with ``filename``, so
        it must already end with a path separator.
    filename : str
        Name of the EDF file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        One column per signal, labelled with the signal labels reported by
        the reader, and one row per sample. If the file contains no signals
        the frame has no rows.

    Raises
    ------
    ValueError
        If the signals do not all contain the same number of samples.
    """
    file = pyedflib.EdfReader(data_path + filename)
    try:
        signal_labels = file.getSignalLabels()
        signals = [file.readSignal(i) for i in range(file.signals_in_file)]
    finally:
        # release the file handle even when a read fails part-way through
        file.close()

    if not signals:
        # nothing to stack; keep whatever labels the reader reported
        return pd.DataFrame(columns=signal_labels)

    # each signal becomes one column; rows are sample positions
    return pd.DataFrame(np.column_stack(signals), columns=signal_labels)

In [63]:
# the only sample file explored so far holds a single signal column
d = convert_data_file(user_folder + '/raw_data/', 'dX.edf')

# shape the frame into the basic layout expected from an EDF pull:
# |value|user_id|device_id|type|time|
d = d.assign(
    user_id=user,
    device_id='device_001',
    type='dX',
    time=d.index,  # the row index is used as the time value
)
d = d.rename(columns={d.columns[0]: 'value'})
d

Unnamed: 0,value,user_id,device_id,type,time
0,-2.000000,user_001,device_001,dX,0
1,-1.007812,user_001,device_001,dX,1
2,-1.001953,user_001,device_001,dX,2
3,-1.001953,user_001,device_001,dX,3
4,-1.004883,user_001,device_001,dX,4
...,...,...,...,...,...
6914555,-1.035156,user_001,device_001,dX,6914555
6914556,-1.009766,user_001,device_001,dX,6914556
6914557,-1.005859,user_001,device_001,dX,6914557
6914558,-1.019531,user_001,device_001,dX,6914558


In [65]:
# pass in two dfs at a time to merge
def merge_files(master_df, df):
    """Outer-join one long-format readings frame onto the master frame.

    Parameters
    ----------
    master_df : pandas.DataFrame
        Frame accumulated so far. Must contain ``user_id`` and ``time``
        columns; it is assumed to already be in the merged layout.
    df : pandas.DataFrame
        Readings to add, with columns ``value``, ``user_id``, ``device_id``,
        ``type`` and ``time``.

    Returns
    -------
    pandas.DataFrame
        Outer join of both frames on ``user_id`` and ``time``. The readings
        from ``df`` appear in a column named after its ``type`` value, and
        its ``type`` and ``device_id`` columns are dropped. Neither input is
        modified.

    Raises
    ------
    IndexError
        If ``df`` has no rows, so no type value can be read.
    KeyError
        If ``df`` lacks the ``type`` or ``device_id`` column.
    """
    # name the measurement column after the reading type so every merged
    # source gets its own column; only the first type value is consulted
    signal_name = df['type'].iloc[0]
    prepared = df.rename(columns={'value': signal_name})
    # keyword form: the positional axis argument was removed in pandas 2.0
    prepared = prepared.drop(columns=['type', 'device_id'])

    # might have to sort the time indices, but that depends on how they are
    # formatted and the likelihood of differences across devices
    return pd.merge(master_df, prepared, on=['user_id', 'time'], how='outer')