# Notebook EDF Conversion and Initial Pipeline

### initial thought process on steps to follow with pulling in data
- pull in metadata and convert from EDF to pd: 
    - ``|user_id|device_id|disease|personal_information|``
- access respective user_id data folder
- pull in data from respective device_id EDF data file
    - ``|user_id|device_id|type|time|value|``
- convert EDF to pandas dataframe
- merge required files together (for respective calculation)

In [1]:
import sys
import getpass
from pathlib import Path

import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt

import pyedflib

In [9]:
# Build the dataset directory from the current login name; the trailing slash
# matters because later cells append file names with plain string concatenation.
user = getpass.getuser()
data_path = f'/Users/{user}/GitHub/fydp-data-science/data/ondri_data/'

### EDF Conversion

#### Use this function to identify what signals are in an EDF file

In [66]:
def identify_signals(data_path, filename):
    """Print and return the signal labels stored in an EDF file.

    Parameters
    ----------
    data_path : str
        Directory prefix; it is concatenated directly with ``filename``, so it
        must already end with a path separator.
    filename : str
        Name of the EDF file inside ``data_path``.

    Returns
    -------
    list
        The labels reported by ``EdfReader.getSignalLabels()``; the position
        of a label in this list is the index to pass to ``convert_data_file``.
    """
    reader = pyedflib.EdfReader(data_path + filename)
    try:
        signal_labels = reader.getSignalLabels()
    finally:
        # always release the file handle, even if reading the header fails
        reader.close()

    print(signal_labels)

    return signal_labels

#### Use this function to pull selected signals from EDF file
 - signals -- an array of indices of the signals we want to pull

In [77]:
def convert_data_file(data_path, filename, signals):
    """Read selected channels of an EDF file into a pandas DataFrame.

    Parameters
    ----------
    data_path : str
        Directory prefix; it is concatenated directly with ``filename``, so it
        must already end with a path separator.
    filename : str
        Name of the EDF file inside ``data_path``.
    signals : iterable of int
        Indices of the channels to extract, i.e. positions in the label list
        returned by ``identify_signals``.

    Returns
    -------
    pandas.DataFrame
        One column per selected channel, named after the channel label and
        ordered as in ``signals``. An empty selection yields an empty frame.

    Raises
    ------
    ValueError
        If the selected channels do not all have the same number of samples
        and therefore cannot be placed side by side row-for-row.
    """
    reader = pyedflib.EdfReader(data_path + filename)
    try:
        signal_labels = reader.getSignalLabels()
        # Key each channel by its own label so a column can never pick up
        # another channel's samples, whatever indices were requested.
        columns = {signal_labels[i]: reader.readSignal(i) for i in signals}
    finally:
        # always release the file handle, even if a read fails
        reader.close()

    lengths = {label: len(samples) for label, samples in columns.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            'selected signals have different sample counts and cannot share '
            'one DataFrame: {}'.format(lengths)
        )

    return pd.DataFrame(columns)

### Converting Files

#### *ECG data*

In [68]:
# List the channel labels in bittium.EDF so the indices to extract can be chosen.
filename = 'bittium.EDF'
# NOTE(review): despite its name, `signals` receives the list of label strings
# returned by identify_signals, not channel indices.
signals = identify_signals(data_path, filename)

['ECG', 'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z', 'Marker', 'HRV']


In [78]:
# Extract only channel index 0 (shown as 'ECG' in the label listing recorded
# above) and preview the first rows.
ecg = convert_data_file(data_path, filename, [0])
ecg.head()

Unnamed: 0,ECG
0,19.0
1,4.0
2,5.0
3,-3.0
4,-23.0


In [79]:
# Save the ECG frame alongside the source data. With to_csv defaults the row
# index is written too, as an unnamed leading column.
ecg.to_csv(data_path + 'ecg_data.csv')

#### *Accelerometer data*

In [80]:
# List the channel labels in geneactiv.EDF; this rebinds the notebook-level
# `filename` and `signals` used by the cells that follow.
filename = 'geneactiv.EDF'
signals = identify_signals(data_path, filename)

['x', 'y', 'z']


In [81]:
# Extract channel indices 0, 1 and 2 (shown as 'x', 'y', 'z' in the label
# listing recorded above) and preview the first rows.
wrist_acc = convert_data_file(data_path, filename, [0,1,2])
wrist_acc.head()

Unnamed: 0,x,y,z
0,-0.050722,-0.041538,-1.015312
1,-0.062537,-0.045464,-1.007386
2,-0.062537,-0.041538,-1.007386
3,-0.046783,-0.049391,-0.999459
4,-0.046783,-0.053318,-1.007386


In [82]:
# Save the accelerometer frame alongside the source data. With to_csv defaults
# the row index is written too, as an unnamed leading column.
wrist_acc.to_csv(data_path + 'acc_data.csv')

### Merging Function

In [65]:
# pass in two dfs at a time to merge
def merge_files(master_df, df):
    """Outer-merge one long-format readings table into the master table.

    Parameters
    ----------
    master_df : pandas.DataFrame
        Table being accumulated; must contain ``user_id`` and ``time`` columns.
    df : pandas.DataFrame
        Readings with ``user_id``, ``device_id``, ``type``, ``time`` and
        ``value`` columns. The ``value`` column is renamed to the first
        distinct entry of ``type``; if ``df`` mixes several types they all end
        up under that one name, so pass a single-type frame.

    Returns
    -------
    pandas.DataFrame
        Outer join of ``master_df`` and the reshaped ``df`` on
        ``['user_id', 'time']``; rows present on only one side are kept with
        NaN in the other side's columns. Neither input is modified.
    """
    # prep the df so that it is in the correct format for merging
    signal_name = df['type'].unique()[0]
    df = df.rename(columns={'value': signal_name})
    # `columns=` instead of a positional axis: pandas 2.0 removed the
    # positional form, which now raises TypeError.
    df = df.drop(columns=['type', 'device_id'])

    # might have to sort the time indices, but that depends on how they are
    # formatted and the likelihood of differences across devices
    merge_df = pd.merge(master_df,
                        df,
                        on=['user_id', 'time'],
                        how='outer')

    return merge_df