Data is structured as follows:
An experiment is performed on multiple cells. To increase certainty, the experiment is repeated multiple times (~3-4). \
During each repetition of the experiment, different protocols are applied, each composed of multiple steps. Finally, the results are analysed and data describing the neuron's electrical response are generated. These consist mostly of spike and inter-spike features. \
To summarize, the data are structured as follows: Experiment/cell/repetitions/protocol/step/spike. \
As the data have to be considered at cell level, we will not consider the experiment data (except for debugging).

# Library imports and data loading

In [2]:
import os
import pandas as pd
from scipy import io


In [4]:
# Read the MATLAB export of one cell; the result is a dict whose keys are
# inspected in the "Cell level data" section below.
loaded_sp = io.loadmat('../data/01_raw/matData_2022-05-30/aCell83_1.mat')

# Functions

In [5]:
# def flatten_df(df, levels):
#     """Flatten all the columns of a datafram to facilitate data extraction
    
#     df: Dataframe to flatten
#     levels: number of wished flattening levels
#     """
#     df_flat = df.copy()
#     count = 0
#     while count<levels:
#         df_flat = df_flat.apply(lambda x: x[0])
#         count+=1
#     return df_flat

def flatten_df(df):
    """Peel one nesting level off the first row of every column of a DataFrame.

    For each column, the value stored in the first row is taken and its first
    element is returned.

    df: DataFrame to flatten; only its first row is read and it is not modified.

    Returns whatever ``DataFrame.apply`` builds from the per-column results
    (a Series indexed by column name when those results are scalars).
    """
    # ``iloc`` picks the first row by position, so the lookup does not depend
    # on the index containing the label 0. ``apply`` already returns a new
    # object, so no defensive copy of ``df`` is required.
    return df.apply(lambda column: column.iloc[0][0])


# Cell level data (+experiment)

In [6]:
loaded_sp.keys()
# Only 'aCell' contains relevant information. The other keys ('__header__', '__version__', '__globals__') are not considered.

dict_keys(['__header__', '__version__', '__globals__', 'aCell'])

#### First branching (but not the last :)
- Id simply provides the experiment ID
- Cellinfo provides the cell data
- FileInfo provides the document data (useless)
- protocol provides all the information regarding the stimulus and the experimental results

In [7]:
loaded_sp['aCell'][0][0].dtype

dtype([('id', 'O'), ('cellInfo', 'O'), ('protocol', 'O'), ('fileInfo', 'O')])

In [8]:
# fileInfo_temp = pd.DataFrame(loaded_sp['aCell'][0][0]['fileInfo'].ravel())
# Wrap the raveled 'cellInfo' field of the aCell record in a DataFrame so it
# can be passed to flatten_df in the next cell.
cellInfo_temp = pd.DataFrame(loaded_sp['aCell'][0][0]['cellInfo'].ravel())

                          

In [9]:
# Extract the cell info
cellInfo_flat = flatten_df(cellInfo_temp)
cellInfo_flat

id                                                     83_1
experimenter                                             OH
species                                               Mouse
channel                                                 ch2
cellType                                              SNcDA
path            \001_140304A03_OH\001_140304A3MMP16SNcDA_OH
status                                                  [1]
dtype: object

# Extraction of data at protocol level

In [16]:
# for i in loaded_sp['aCell'][0][0]['protocol'][0]:
#     print(i[0][0][0][0])
# Keep only the first entry of the 'protocol' field; it is the one used by
# the following sections.
protocol_raw = loaded_sp['aCell'][0][0]['protocol'][0][0]
# The protocol name is stored at [0][0][0][0] of the entry.
protocol_raw[0][0][0][0]#['APWaveform']

'APWaveform'

In [17]:
for i in loaded_sp['aCell'][0][0]['protocol'][0]:
    print(i[0][0][0][0])

APWaveform
DeHyperPol
Delta
ElecCal
FirePattern
HyperDePol
IDRest
IDThres
IV
NegCheops
PosCheops
RPip
Rac
ResetITC
SetAmpl
SetISI
SineSpec
SponHold30
SponHold3
SponNoHold30
StartHold
StartNoHold
SubWhiteNoise
TestAmpl
TestRheo
sAHP


In [18]:
protocol_raw[0][0][0][0]#['APWaveform']

'APWaveform'

#### APWaveform (Illustrative)

In [19]:
# protocol_raw[0][0][0][0]

In [20]:
protocol_raw[0][0][0][0]#['APWaveform']

'APWaveform'

In [21]:
# One row per repetition of the selected protocol: tag every row with the
# protocol name and expose the positional index as a 'repetition' column.
protocol_df = (
    pd.DataFrame(protocol_raw[0][0][1][0])
    .assign(protocol_name=protocol_raw[0][0][0][0])
    .reset_index()
    .rename(columns={'index': 'repetition'})
)

protocol_df


Unnamed: 0,repetition,nTrace,ids,stim,stim_actual,stim_ids,stim_change,vHold,stim_start,stim_end,...,AP_rise_time,AP_fall_time,time_to_AP_peak,min_cur_for_discharge,AHP_duration,AHP_fall_tau,AHP_fall_A,AHP_rise_m,AHP_rise_c,protocol_name
0,0,[[6]],"[[1061, 1062, 1063, 1064, 1065, 1066]]","[[60, 100, 140, 180, 220, 260]]","[[157.20000000000002, 262.0, 366.8, 471.6, 576...","[[1, 12501, 40627, 53124]]","[[0.0, 250.0, 812.5200000000001, 1062.46000000...","[[[[-65.32608302]], [[-69.4267259]], [[-71.223...",[[250]],[[812.5200000000001]],...,"[[[[]], [[]], [[]], [[1.02820008]], [[0.942560...","[[[[]], [[]], [[]], [[2.22338669]], [[1.965385...","[[[[]], [[]], [[]], [[446.32]], [[145.1 256.2...",[[[[471.6]]]],"[[[[]], [[]], [[]], [[]], [[104.22]], [[60.96 ...","[[[], [], [], [[1.55071061]], [[0.91632396 1.2...","[[[], [], [], [[19.21890676]], [[19.67356072 2...","[[[], [], [], [], [[0.12885464]], [[0.23297097...","[[[], [], [], [], [[-94.50044549]], [[-126.092...",APWaveform
1,1,[[6]],"[[2029, 2030, 2031, 2032, 2033, 2034]]","[[60, 100, 140, 180, 220, 260]]","[[206.4, 344.0, 481.59999999999997, 619.2, 756...","[[1, 12501, 40627, 53124]]","[[0.0, 250.0, 812.5200000000001, 1062.46000000...","[[[[-66.74095101]], [[-71.31555915]], [[-73.01...",[[250]],[[812.5200000000001]],...,"[[[[]], [[]], [[0.96639981]], [[0.94820008 1.0...","[[[[]], [[]], [[2.00162873]], [[1.96450179 2.1...","[[[[]], [[]], [[151.38]], [[122.58 170.2 283....",[[[[481.6]]]],"[[[[]], [[]], [[]], [[ 40.78 105.5 ]], [[34.18...","[[[], [], [[0.97340608]], [[0.76206102 1.02010...","[[[], [], [[17.4269363]], [[16.77670915 17.112...","[[[], [], [], [[0.33879904 0.11556989]], [[0.4...","[[[], [], [], [[-168.9938747 -86.02544872]],...",APWaveform


# Extraction of data at trace level

In [22]:
# Columns that hold one value (or one array of values) per trace; these are
# the columns exploded to go from repetition level to trace level.
trace_level_columns = [
    'ids',
    'stim',
    'stim_actual',
    'vHold',
    'peak_indices',
    'spikecount',
    'peak_time',
    'peak_voltage',
    'ISI_values',
    'min_AHP_indices',
    'min_AHP_time',
    'min_AHP_voltage',
    'AP_begin_voltage',
    'AP_begin_time',
    'AP_amplitude',
]

In [23]:
# The trace-level columns wrap their per-trace values in two nesting levels,
# so the same multi-column explode is applied twice to reach one row per trace.
# explode returns a new frame, leaving protocol_df untouched.
trace_df = (
    protocol_df
    .explode(trace_level_columns)
    .explode(trace_level_columns)
)

In [24]:
trace_df.head()

Unnamed: 0,repetition,nTrace,ids,stim,stim_actual,stim_ids,stim_change,vHold,stim_start,stim_end,...,AP_rise_time,AP_fall_time,time_to_AP_peak,min_cur_for_discharge,AHP_duration,AHP_fall_tau,AHP_fall_A,AHP_rise_m,AHP_rise_c,protocol_name
0,0,[[6]],1061,60,157.2,"[[1, 12501, 40627, 53124]]","[[0.0, 250.0, 812.5200000000001, 1062.46000000...",[[-65.32608302202645]],[[250]],[[812.5200000000001]],...,"[[[[]], [[]], [[]], [[1.02820008]], [[0.942560...","[[[[]], [[]], [[]], [[2.22338669]], [[1.965385...","[[[[]], [[]], [[]], [[446.32]], [[145.1 256.2...",[[[[471.6]]]],"[[[[]], [[]], [[]], [[]], [[104.22]], [[60.96 ...","[[[], [], [], [[1.55071061]], [[0.91632396 1.2...","[[[], [], [], [[19.21890676]], [[19.67356072 2...","[[[], [], [], [], [[0.12885464]], [[0.23297097...","[[[], [], [], [], [[-94.50044549]], [[-126.092...",APWaveform
0,0,[[6]],1062,100,262.0,"[[1, 12501, 40627, 53124]]","[[0.0, 250.0, 812.5200000000001, 1062.46000000...",[[-69.42672589539576]],[[250]],[[812.5200000000001]],...,"[[[[]], [[]], [[]], [[1.02820008]], [[0.942560...","[[[[]], [[]], [[]], [[2.22338669]], [[1.965385...","[[[[]], [[]], [[]], [[446.32]], [[145.1 256.2...",[[[[471.6]]]],"[[[[]], [[]], [[]], [[]], [[104.22]], [[60.96 ...","[[[], [], [], [[1.55071061]], [[0.91632396 1.2...","[[[], [], [], [[19.21890676]], [[19.67356072 2...","[[[], [], [], [], [[0.12885464]], [[0.23297097...","[[[], [], [], [], [[-94.50044549]], [[-126.092...",APWaveform
0,0,[[6]],1063,140,366.8,"[[1, 12501, 40627, 53124]]","[[0.0, 250.0, 812.5200000000001, 1062.46000000...",[[-71.22377162143096]],[[250]],[[812.5200000000001]],...,"[[[[]], [[]], [[]], [[1.02820008]], [[0.942560...","[[[[]], [[]], [[]], [[2.22338669]], [[1.965385...","[[[[]], [[]], [[]], [[446.32]], [[145.1 256.2...",[[[[471.6]]]],"[[[[]], [[]], [[]], [[]], [[104.22]], [[60.96 ...","[[[], [], [], [[1.55071061]], [[0.91632396 1.2...","[[[], [], [], [[19.21890676]], [[19.67356072 2...","[[[], [], [], [], [[0.12885464]], [[0.23297097...","[[[], [], [], [], [[-94.50044549]], [[-126.092...",APWaveform
0,0,[[6]],1064,180,471.6,"[[1, 12501, 40627, 53124]]","[[0.0, 250.0, 812.5200000000001, 1062.46000000...",[[-72.37195345225686]],[[250]],[[812.5200000000001]],...,"[[[[]], [[]], [[]], [[1.02820008]], [[0.942560...","[[[[]], [[]], [[]], [[2.22338669]], [[1.965385...","[[[[]], [[]], [[]], [[446.32]], [[145.1 256.2...",[[[[471.6]]]],"[[[[]], [[]], [[]], [[]], [[104.22]], [[60.96 ...","[[[], [], [], [[1.55071061]], [[0.91632396 1.2...","[[[], [], [], [[19.21890676]], [[19.67356072 2...","[[[], [], [], [], [[0.12885464]], [[0.23297097...","[[[], [], [], [], [[-94.50044549]], [[-126.092...",APWaveform
0,0,[[6]],1065,220,576.4,"[[1, 12501, 40627, 53124]]","[[0.0, 250.0, 812.5200000000001, 1062.46000000...",[[-72.89302930550245]],[[250]],[[812.5200000000001]],...,"[[[[]], [[]], [[]], [[1.02820008]], [[0.942560...","[[[[]], [[]], [[]], [[2.22338669]], [[1.965385...","[[[[]], [[]], [[]], [[446.32]], [[145.1 256.2...",[[[[471.6]]]],"[[[[]], [[]], [[]], [[]], [[104.22]], [[60.96 ...","[[[], [], [], [[1.55071061]], [[0.91632396 1.2...","[[[], [], [], [[19.21890676]], [[19.67356072 2...","[[[], [], [], [], [[0.12885464]], [[0.23297097...","[[[], [], [], [], [[-94.50044549]], [[-126.092...",APWaveform


In [25]:
trace_df.loc[(trace_df.repetition==0) & (trace_df.stim==125)]

Unnamed: 0,repetition,nTrace,ids,stim,stim_actual,stim_ids,stim_change,vHold,stim_start,stim_end,...,AP_rise_time,AP_fall_time,time_to_AP_peak,min_cur_for_discharge,AHP_duration,AHP_fall_tau,AHP_fall_A,AHP_rise_m,AHP_rise_c,protocol_name


In [26]:
trace_df.loc[(trace_df.repetition==0)].vHold.tolist()

[array([[-65.32608302]]),
 array([[-69.4267259]]),
 array([[-71.22377162]]),
 array([[-72.37195345]]),
 array([[-72.89302931]]),
 array([[-73.28532125]])]

In [None]:
# import numpy as np
# for i in trace_df.loc[(trace_df.repetition==0)].AP_begin_voltage:
#     print(np.mean(i))

In [None]:
# Rebuild the trace-level frame from protocol_df and preview it.
# NOTE(review): these statements repeat the earlier trace-level explode cell
# verbatim; on a top-to-bottom run this looks redundant - confirm before removing.
trace_df = protocol_df.copy()
# trace_df[trace_level_columns] = trace_df[trace_level_columns].apply(lambda x: x.)
trace_df = trace_df.explode(trace_level_columns).explode(trace_level_columns)
trace_df.head()

# Extraction of data at peak level


In [None]:
# Features padded to `max_peaks` entries per trace before the peak-level explode.
peak_columns = [
    'peak_indices',
    'peak_time',
    'peak_voltage',
    'min_AHP_indices',
    'min_AHP_time',
    'min_AHP_voltage',
    'AP_begin_voltage',
    'AP_begin_time',
    'AP_amplitude',
    'AP_half_width',
    'AP_width',
    'AP_rise_time',
    'AP_fall_time',
    'time_to_AP_peak',
]

# Features padded to `max_peaks - 1` entries per trace (plus one sentinel
# entry) before the peak-level explode.
intra_peak_columns = [
    'ISI_values',
    'AHP_duration',
    'AHP_rise_m',
    'AHP_rise_c',
]

In [None]:
def peak_len_scale(value, max_peaks, fill_value=0):
    """Return the values of one trace as a list padded to ``max_peaks`` entries.

    value: array with the feature values of one trace. When its first
        dimension is empty there are no values; otherwise the values are read
        from its first row (``value[0]``).
    max_peaks: length the returned list is padded to.
    fill_value: placeholder appended for the missing entries. Defaults to 0,
        which reproduces the historical behaviour.

    Returns a new list. It is never truncated: when ``value`` already holds
    ``max_peaks`` entries or more, its values are returned unchanged.
    """
    val_list = [] if value.shape[0] == 0 else list(value[0])
    missing = max_peaks - len(val_list)
    # A non-positive count multiplies to an empty list, so nothing is appended.
    val_list += missing * [fill_value]
    return val_list

def intra_peak_len_scale(value, max_peaks, fill_value=0):
    """Pad the values of one trace to ``max_peaks`` entries and add a sentinel.

    value: array with the feature values of one trace. When its first
        dimension is empty there are no values; otherwise the values are read
        from its first row (``value[0]``).
    max_peaks: length the values are padded to before the sentinel is added.
    fill_value: placeholder appended for the missing entries. Defaults to 0,
        which reproduces the historical behaviour.

    Returns a new list of padded values followed by the string
    ``'extra_inta_peak'``, i.e. one entry more than ``max_peaks`` when the
    input holds at most ``max_peaks`` values.
    """
    val_list = [] if value.shape[0] == 0 else list(value[0])
    missing = max_peaks - len(val_list)
    # A non-positive count multiplies to an empty list, so nothing is appended.
    val_list += missing * [fill_value]
    # Trailing sentinel entry; the literal is kept exactly as it was.
    val_list.append('extra_inta_peak')

    return val_list

In [None]:
# Start from the trace-level frame with a fresh 0..n-1 index.
peak_df = trace_df.reset_index(drop=True)

# Largest spike count over all traces: every per-peak list is padded to it.
max_peaks = max(count[0][0] for count in peak_df.spikecount)

for col in peak_columns:
    peak_df[col] = peak_df[col].apply(peak_len_scale, args=(max_peaks,))

# These columns are padded to one entry fewer; the helper then appends one
# sentinel entry, so their lists end up as long as the peak-level ones.
for col in intra_peak_columns:
    peak_df[col] = peak_df[col].apply(intra_peak_len_scale, args=(max_peaks - 1,))

# All lists now share one length per row and can be exploded together.
peak_df = peak_df.explode(peak_columns + intra_peak_columns)



In [None]:
peak_df.shape

# Flattening at peak level

We will now start the flattening process. At the moment, the data frame is organized in rows, each representing one of the peaks we observed. The new target is one row per trace, with the data of each peak stored in columns.\
For that, we will create one column per peak and feature (e.g. the peak time of peak #4).

In [None]:
# Create a unique peak id for merging/debugging purposes.
peak_df_flat = peak_df.copy()
# Constant helper column: its running sum numbers the rows 1..n, and it is
# reused later to rank the peaks inside each trace.
peak_df_flat['base_temp'] = 1
peak_df_flat['peak_id_temp'] = peak_df_flat['base_temp'].cumsum()
# peak_id has the form '<repetition>_<ids>_<row number>'.
peak_df_flat['peak_id'] = peak_df_flat['repetition'].astype('str')+ '_' +peak_df_flat['ids'].astype('str') + '_'+peak_df_flat['peak_id_temp'].astype('str')

In [None]:
# Create a peak rank column, so that we can order the peaks of each trace.
# Rows are sorted by ascending peak time, then numbered within every
# (repetition, ids) group by a running sum of the constant `base_temp` column.
# NOTE(review): entries padded by peak_len_scale carry peak_time == 0 and
# therefore sort before the real peaks of the same trace - confirm this
# ordering is the intended one.
grouping_df = peak_df_flat.sort_values('peak_time')[['repetition', 'peak_id', 'ids', 'base_temp']]
# Select `base_temp` explicitly: a frame-wide cumsum would also be applied to
# the string column `peak_id`, which recent pandas versions reject.
grouping_df['peak_rank'] = grouping_df.groupby(['repetition', 'ids'])['base_temp'].cumsum()


In [None]:
# Attach the per-trace peak rank to every peak row, joining on the unique peak_id.
# NOTE(review): the result overwrites peak_df_flat, so re-running this cell
# merges `peak_rank` a second time - presumably meant to run exactly once.
peak_df_flat = peak_df_flat.merge(grouping_df[['peak_rank', 'peak_id']] ,how='left', on='peak_id')


In [None]:
peak_df_flat.head()

In [None]:
# Reshape to one row per (repetition, ids) with one column per
# (feature, peak_rank) pair.
peak_df_flat = peak_df_flat.pivot_table(
    index=['repetition', 'ids'],
    columns='peak_rank',
    values=intra_peak_columns + peak_columns,
)

# Collapse the two-level column labels into '<feature>_<rank>' names and turn
# the row keys back into ordinary columns.
peak_df_flat.columns = [f'{feature}_{rank}' for feature, rank in peak_df_flat.columns]
peak_df_flat = peak_df_flat.reset_index()

peak_df_flat.head()

In [None]:
peak_df_flat.shape

# Flatten at trace level

In [None]:
# Column names as they will read once the '_trace' suffix is added; the keys
# 'repetition' and 'ids' are left out of both lists.
peak_df_cols = peak_df_flat.columns.tolist()

peak_feat = [
    f'{col}_trace'
    for col in peak_df_cols
    if col not in ['repetition', 'ids']
]
trace_level_columns_pivot = [
    f'{col}_trace'
    for col in trace_level_columns
    if col not in ['repetition', 'ids']
]




In [None]:
# Integrate back trace data: join the trace-level columns onto the flattened
# peak table, then pivot so that each repetition gets its own set of columns.

trace_keys = ['repetition', 'ids']

# 'ids' is already part of trace_level_columns, so only 'repetition' has to be
# added to have both join keys on the right-hand side.
trace_df_flat = peak_df_flat.merge(trace_df[trace_level_columns+['repetition']], on=trace_keys, how='left')

# Every column, join keys included, receives the '_trace' suffix
# ('repetition' becomes 'repetition_trace').
trace_df_flat.columns = [f'{col}_trace' for col in trace_df_flat.columns]
trace_df_flat['protocol_name'] = protocol_raw[0][0][0][0]

# peak_df_flat = peak_df_flat.pivot_table(index=['repetition', 'ids'], columns='peak_rank', values= intra_peak_columns+peak_columns )

# One row per protocol, one column per (feature, repetition) pair.
# NOTE(review): no aggfunc is passed, so pivot_table uses its default
# aggregation over the traces of a repetition - confirm that this is intended
# and that the array-valued trace-level columns can be aggregated that way.
trace_df_flat = trace_df_flat.pivot_table(index= 'protocol_name', columns='repetition_trace', values= peak_feat + trace_level_columns_pivot )

# Collapse the two-level column labels into '<feature>_<repetition>' names.
trace_df_flat.columns = [ f'{column[0]}_{column[1]}' for column in trace_df_flat.columns]
trace_df_flat.reset_index(inplace=True)

trace_df_flat.head()
