# Notebook to generate table-based data for model injection.

In [1]:
#Uncomment the next two lines if you are using Google Colab

#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Root folder of the raw acquisitions (a path under the Colab drive mount point);
# makeData walks it as <data_folder>/<group folder>/<patient folder>/...
data_folder = '/content/drive/Shareddrives/Shared_Stefano_Vicenzo/Acquisitions/Data/TVS'
# File name of the MATLAB recording that makeData looks up inside each
# patient's 'Out of Lab' folder.
data_format = 'data.mat'

In [None]:
!pip install matgrab --quiet

In [None]:
!pip install sktime --quiet

In [None]:
import scipy.io as spio
import os
import pandas as pd
import numpy as np
import math
import itertools

import gc

gc.enable()

In [None]:
def loadmat(filename):
    '''
    Load a .mat file and return its contents as plain nested dictionaries.

    Use this instead of calling spio.loadmat directly: the raw loader leaves
    MATLAB structs as mat-objects, and this wrapper passes the loaded
    dictionary through _check_keys so that those entries are converted.

    filename: path of the .mat file to read.
    Returns the dictionary produced by spio.loadmat after conversion.
    '''
    # struct_as_record=False yields mat-objects with attribute access, and
    # squeeze_me=True drops unit dimensions from the loaded arrays.
    read_options = {'struct_as_record': False, 'squeeze_me': True}
    raw_contents = spio.loadmat(filename, **read_options)
    return _check_keys(raw_contents)

def _check_keys(dict):
    '''
    Convert the top-level mat-object entries of a loaded .mat dictionary.

    Every value that is a mat-object is replaced, in place, by the nested
    dictionary returned by _todict; all other values are left untouched.

    dict: dictionary as returned by spio.loadmat (the parameter name shadows
          the builtin and is kept only for backward compatibility).
    Returns the same dictionary object that was passed in.
    '''
    # Prefer the public location of mat_struct; the `mio5_params` namespace is
    # deprecated in recent SciPy releases and is only used as a fallback for
    # older installations that do not expose the public name.
    struct_type = getattr(spio.matlab, 'mat_struct', None)
    if struct_type is None:
        struct_type = spio.matlab.mio5_params.mat_struct

    # Only existing keys are reassigned, so the dictionary size never changes
    # while it is being iterated.
    for key, value in dict.items():
        if isinstance(value, struct_type):
            dict[key] = _todict(value)
    return dict

def _todict(matobj):
    '''
    Recursively build a nested dictionary from a mat-object.

    matobj: object exposing `_fieldnames` (the list of field names) and
            holding each field as an instance attribute.
    Returns a dictionary with one entry per name in `_fieldnames`; fields that
    are themselves mat-objects are converted recursively, every other value is
    stored as is (mat-objects nested inside arrays are NOT converted).
    '''
    # Prefer the public location of mat_struct; the `mio5_params` namespace is
    # deprecated in recent SciPy releases and is only used as a fallback for
    # older installations that do not expose the public name.
    struct_type = getattr(spio.matlab, 'mat_struct', None)
    if struct_type is None:
        struct_type = spio.matlab.mio5_params.mat_struct

    converted = {}
    for field in matobj._fieldnames:
        value = matobj.__dict__[field]
        converted[field] = _todict(value) if isinstance(value, struct_type) else value
    return converted

def print_mat_nested(d, indent=0, nkeys=0):
    '''
    Print the key / field hierarchy of a nested structure loaded from a .mat file.

    d:      a dictionary, a NumPy structured array, or any other value
            (other values print nothing).
    indent: number of tab characters placed before each printed line; it is
            increased by one for every nesting level.
    nkeys:  when greater than zero and `d` is a dictionary, only its first
            `nkeys` keys are printed. The limit applies to the first level
            only (it is not forwarded to the recursive calls) and is ignored
            for non-dictionary inputs, which previously made the function
            fail with an AttributeError.
    '''
    if isinstance(d, dict):
        if nkeys > 0:
            # Keep only the first nkeys keys, in insertion order.
            d = {k: d[k] for k in list(d)[:nkeys]}
        for key, value in d.items():
            print('\t' * indent + 'Key: ' + str(key))
            print_mat_nested(value, indent + 1)
    elif isinstance(d, np.ndarray) and d.dtype.names is not None:
        # A non-None dtype.names means the array is a structured array.
        for n in d.dtype.names:
            print('\t' * indent + 'Field: ' + str(n))
            print_mat_nested(d[n], indent + 1)

In [None]:
def filter_values(context_file, df, num_ts, samples_per_value=100):
    '''
    Select the context values that fall inside the recording's time window
    and repeat each of them so the result can be aligned with the samples.

    context_file:      mapping with a 'contextValues' entry, itself a mapping
                       from a timestamp key (convertible with int()) to a
                       sequence with at least four elements.
    df:                DataFrame with a 'Timestamp (ms)' column; its first and
                       last values (truncated to integers) bound the window,
                       both ends included.
    num_ts:            maximum number of rows returned (the repeated list is
                       sliced with [:num_ts]).
    samples_per_value: how many times each selected context row is repeated.
                       Defaults to 100, the factor that used to be hard-coded
                       (presumably samples per context value -- TODO confirm).

    Returns a list of rows [timestamp, v0, v1, v2, v3]. Repeated rows are the
    same list object, not copies.
    Raises IndexError when the context is not empty but `df` has no rows.
    '''
    context = context_file['contextValues']
    if not context:
        # Nothing to filter; do not touch df so an empty frame is tolerated.
        return []

    # The window bounds do not depend on the context key: compute them once.
    timestamps = df['Timestamp (ms)']
    first_ts = math.trunc(timestamps.iloc[0])
    last_ts = math.trunc(timestamps.iloc[-1])

    filtered_ctx = []
    for key, values in context.items():
        t = int(key)
        if first_ts <= t <= last_ts:
            filtered_ctx.append([t, values[0], values[1], values[2], values[3]])

    repeated = itertools.chain.from_iterable(
        itertools.repeat(row, samples_per_value) for row in filtered_ctx
    )
    return list(repeated)[:num_ts]

In [None]:
def checkIndoor(filtered_ctx, staypts):
    '''
    Map every context row to an indoor label.

    filtered_ctx: sequence of rows; only the element at index 1 of each row
                  is read.
    staypts:      accepted for interface compatibility, not used here.

    Returns a list with one label per row: 0.5 when the value is exactly 50,
    1 when it is greater than 50, and 0 otherwise.
    '''
    def _label(value):
        if value == 50:
            return 0.5
        return 1 if value > 50 else 0

    return [_label(row[1]) for row in filtered_ctx]

In [None]:
def preprocess_subject(df):
    '''
    Reduce a per-patient frame to its magnetometer columns and tidy the names.

    The frame is modified in place and also returned. Steps:
      1. drop every column that neither contains 'Mag', nor starts with 'T',
         nor is 'Indoor Probability' / 'Patient ID';
      2. for each sensor position add 'Mag<sensor>_Norm', the row-wise
         Euclidean norm of all remaining columns whose name contains it;
      3. rename sensor columns to 'Mag<XX>_<axis>' (XX is the short sensor
         code; axis is 'Norm' or the lower-cased first character of the text
         after the last underscore) and shorten the timestamp, indoor and
         patient columns to 'Timestamp', 'Indoor' and 'Patient';
      4. reset the index to 0..n-1.
    '''
    def _is_kept(name):
        return ('Mag' in name
                or name.startswith('T')
                or name in ('Indoor Probability', 'Patient ID'))

    unwanted = [name for name in df.columns if not _is_kept(name)]
    df.drop(columns=unwanted, inplace=True)

    short_codes = {'LowerBack': 'LB', 'LeftFoot': 'LF', 'RightFoot':'RF', 'Wrist':'WR'}

    # One norm column per sensor, computed over the columns present so far.
    for sensor in short_codes:
        matching = [name for name in df.columns if sensor in name]
        df[f'Mag{sensor}_Norm'] = np.linalg.norm(df[matching].values, axis=1)

    # Collect every rename first, then apply them in a single call.
    renames = {}
    for name in df.columns:
        sensor = next((key for key in short_codes if key in name), None)
        if sensor is not None:
            suffix = name.split('_')[-1]
            axis = suffix if 'Norm' in name else suffix.lower()[0]
            renames[name] = f'Mag{short_codes[sensor]}_{axis}'
        elif 'Timestamp' in name:
            renames[name] = 'Timestamp'
        elif 'Indoor' in name:
            renames[name] = 'Indoor'
        elif 'Patient' in name:
            renames[name] = 'Patient'
    df.rename(columns=renames, inplace=True)

    df.reset_index(inplace=True, drop=True)
    return df

In [None]:
def _triaxial_frame(recording):
    '''Flatten a nested recording dict into one column per sensor axis plus the timestamp.'''
    flat = pd.json_normalize(recording)
    # Column-name marker -> unit used in the generated column names.
    # Order matters: the first marker found in the column name wins.
    units = (('Acc', 'g'), ('Gyr', 'deg/s'), ('Mag', 'uT'))

    frame = pd.DataFrame()
    for col in flat.columns:
        if 'Fs' in col:
            # Columns containing 'Fs' are never expanded into axes.
            continue
        unit = next((u for marker, u in units if marker in col), None)
        if unit is None:
            continue
        samples = flat[col].values[0]
        for axis_idx, axis in enumerate('XYZ'):
            frame[f'{col}_{axis} ({unit})'] = pd.Series([v[axis_idx] for v in samples])

    frame['Timestamp (ms)'] = recording['SU_INDIP']['LowerBack']['Timestamp']
    return frame


def _load_context_factors(ctx_dir):
    '''Read the 'stay*' and 'per*' JSON files of one patient; raise if either is missing.'''
    staypts = None
    ctx = None
    for name in os.listdir(ctx_dir):
        if name.startswith('stay'):
            staypts = pd.read_json(os.path.join(ctx_dir, name))['data']
        elif name.startswith('per'):
            ctx = pd.read_json(os.path.join(ctx_dir, name))['data'][0]
    if staypts is None or ctx is None:
        # Without this check the values of the previously processed patient
        # would silently be reused.
        raise FileNotFoundError(f"Missing 'stay*' or 'per*' context file in {ctx_dir}")
    return staypts, ctx


def makeData(data_folder, data_format, ts=997417, verbose=False,
             output_folder='/content/drive/Shareddrives/Shared_Stefano_Vicenzo/Code/Data'):
    '''
    Export one preprocessed CSV file per patient.

    Layout read by this function:
        <data_folder>/<group>/<patient>/Out of Lab/<data_format>
        <data_folder>/<group>/<patient>/Out of Lab/Contextual Factors/
            (JSON files whose names start with 'stay' and 'per')

    For every patient that has no 'df_<patient>.csv' in `output_folder` yet,
    the recording at data -> TimeMeasure1 -> Recording4 is flattened into
    per-axis columns, labelled with the indoor values derived from the context
    file, reduced by preprocess_subject and written to that CSV file.

    data_folder:   root folder; each sub-folder is one group of patients.
    data_format:   file name of the .mat recording inside 'Out of Lab'.
    ts:            maximum number of context rows kept (forwarded to
                   filter_values as num_ts).
    verbose:       when True, print the key hierarchy of each recording.
    output_folder: folder where the CSV files are looked up and written;
                   defaults to the location that used to be hard-coded.

    Returns an empty DataFrame: nothing is accumulated in memory, the CSV
    files are the actual output. Patients whose processing raises an
    exception are reported and skipped.
    '''
    full_df = pd.DataFrame()
    for folder in os.listdir(data_folder):
        group_dir = os.path.join(data_folder, folder)
        if not os.path.isdir(group_dir):
            # Stray files next to the group folders must not abort the run.
            continue

        for patient in os.listdir(group_dir):
            print(f"Processing patient id: {patient}")
            out_csv = os.path.join(output_folder, f'df_{patient}.csv')
            if os.path.isfile(out_csv):
                print("Existing file for ID: ", patient)
                continue

            try:
                #-------------LOAD DATA-------------
                patient_dir = os.path.join(group_dir, patient)
                filemat = os.path.join(patient_dir, 'Out of Lab', data_format)
                matdata = loadmat(filemat)
                recording = matdata['data']['TimeMeasure1']['Recording4']
                if verbose:
                    print_mat_nested(recording)

                #---------CREATE RAW DATASET--------
                df_triaxial = _triaxial_frame(recording)

                #---------FILTER RAW DATASET--------
                staypts, ctx = _load_context_factors(
                    os.path.join(patient_dir, 'Out of Lab', 'Contextual Factors'))
                filtered_ctx = filter_values(ctx, df_triaxial, ts)
                df_triaxial['Indoor Probability'] = pd.Series(checkIndoor(filtered_ctx, staypts))

                df_triaxial['Patient ID'] = patient
                df_triaxial['Disease'] = folder

                # Count rows per label; summing the label values themselves
                # would report only half of the 0.5-labelled rows.
                total = len(df_triaxial)
                n_null = (df_triaxial['Indoor Probability'] == 0.5).sum()
                n_indoor = (df_triaxial['Indoor Probability'] == 1).sum()
                print('Total len: ', total)
                print('Total null: ', n_null)
                print('Total indoor: ', n_indoor)
                print('Total null percentage: ', n_null / total * 100)
                print('Total indoor percentage: ', n_indoor / total * 100)

                print('Saving patient CSV file...')
                df_triaxial = preprocess_subject(df_triaxial)
                df_triaxial.to_csv(out_csv)
                print('Saved!')
            except Exception as err:
                # Best effort: report the patient and carry on with the next
                # one. KeyboardInterrupt / SystemExit are no longer swallowed.
                print(f'Corrupted patient file: {patient}')
                print(f'  Reason: {err!r}')
    return full_df

In [None]:
# Run the export for every patient folder. NOTE: makeData never adds rows to
# the frame it returns, so `df` is an empty DataFrame; the per-patient CSV
# files written by makeData are the actual output.
df = makeData(data_folder, data_format)