In [1]:
import os
from pathlib import Path
import h5py
import numpy as np
import pandas as pd

In [2]:
base_path = os.path.dirname(os.getcwd())
base_path = Path(base_path)

In [3]:
df = pd.read_excel(f'{base_path}/confinement_database.xlsx')
df.head()

Unnamed: 0,shot,tstart (ms),tstop (ms),L-mode,H-mode,QH-mode,WP QH-mode,Notes
0,149992.0,2540.0,2635.0,1.0,,,,
1,149992.0,2638.0,3200.0,,1.0,,,"ELM-free, then ELMy"
2,149992.0,4038.0,4125.0,1.0,,,,
3,149992.0,4136.0,4500.0,,1.0,,,"ELM-free, then ELMy"
4,149993.0,1100.0,1900.0,1.0,,,,long L-mode due to failed LH transition


In [4]:
df.fillna(value=0, inplace=True)
df.head(10)

Unnamed: 0,shot,tstart (ms),tstop (ms),L-mode,H-mode,QH-mode,WP QH-mode,Notes
0,149992.0,2540.0,2635.0,1.0,0.0,0.0,0.0,0
1,149992.0,2638.0,3200.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
2,149992.0,4038.0,4125.0,1.0,0.0,0.0,0.0,0
3,149992.0,4136.0,4500.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
4,149993.0,1100.0,1900.0,1.0,0.0,0.0,0.0,long L-mode due to failed LH transition
5,149993.0,2540.0,2635.0,1.0,0.0,0.0,0.0,0
6,149993.0,2650.0,3400.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
7,149993.0,4050.0,4165.0,1.0,0.0,0.0,0.0,0
8,149993.0,4172.0,4975.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
9,149994.0,1340.0,1900.0,1.0,0.0,0.0,0.0,long L-mode due to failed LH transition


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   shot         41 non-null     float64
 1   tstart (ms)  41 non-null     float64
 2   tstop (ms)   41 non-null     float64
 3   L-mode       41 non-null     float64
 4   H-mode       41 non-null     float64
 5   QH-mode      41 non-null     float64
 6   WP QH-mode   41 non-null     float64
 7   Notes        41 non-null     object 
dtypes: float64(7), object(1)
memory usage: 2.7+ KB


## A few things to confirm:
- Are `tstart` and `tstop` just the indexes for both time and signals or should I shift them by 800?
- Does it make sense to multiply `tstart` and `tstop` by 1000 to retrieve corresponding signals?

In [8]:
import re
data = []
for f in os.listdir(f'{base_path}/data'):
    print(f'Filename: {f}')
    try:
        shot_data = h5py.File(f'{base_path}/data/{f}')
        shot_num = re.findall(r'\d+', f)[0]
        print(f'Shot number: {shot_num}')
        signals = np.array(shot_data['signals']).transpose()
        time = (np.array(shot_data['time']) * 1000).astype(int)
        print(f'Signals: {signals.shape}')
        print(f'Time: {time.shape}')
        shot_df = df[df['shot'] == float(shot_num)]
        label = []
        signal = []
        for i, row in shot_df.iterrows():
            tstart = int(row['tstart (ms)'])
            tstop = int(row['tstop (ms)'])
            tmode_signal = signals[tstart:tstop]
            signal_df = pd.DataFrame(
                {f'Ch_{j+1}':tmode_signal[:,j] for j in range(tmode_signal.shape[1])},
                columns=[f'Ch_{j+1}' for j in range(tmode_signal.shape[1])]
            )
            label_vec = row[['L-mode', 'H-mode', 'QH-mode', 'WP QH-mode']].to_numpy(dtype=int)
            signal_df['mode'] = np.argmax(label_vec)
            signal.append(signal_df)
        # signal_df['mode'] = label
        signal = pd.concat(signal, axis=0)
        signal['shot_num'] = shot_num
        print('Target value counts:')
        print(signal['mode'].value_counts())
        data.append(signal)
    except FileNotFoundError:
        'HDF5 file not found.'
data = pd.concat(data, axis=0)
print(data.info())
data.head()

Filename: bes_signals_149993.hdf5
Shot number: 149993
Signals: (5242880, 64)
Time: (5242880,)
Target value counts:
1    1553
0    1010
Name: mode, dtype: int64
Filename: bes_signals_149992.hdf5
Shot number: 149992
Signals: (5242880, 64)
Time: (5242880,)
Target value counts:
1    926
0    182
Name: mode, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3671 entries, 0 to 363
Data columns (total 66 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Ch_1      3671 non-null   float64
 1   Ch_2      3671 non-null   float64
 2   Ch_3      3671 non-null   float64
 3   Ch_4      3671 non-null   float64
 4   Ch_5      3671 non-null   float64
 5   Ch_6      3671 non-null   float64
 6   Ch_7      3671 non-null   float64
 7   Ch_8      3671 non-null   float64
 8   Ch_9      3671 non-null   float64
 9   Ch_10     3671 non-null   float64
 10  Ch_11     3671 non-null   float64
 11  Ch_12     3671 non-null   float64
 12  Ch_13     3671 non-null  

Unnamed: 0,Ch_1,Ch_2,Ch_3,Ch_4,Ch_5,Ch_6,Ch_7,Ch_8,Ch_9,Ch_10,...,Ch_57,Ch_58,Ch_59,Ch_60,Ch_61,Ch_62,Ch_63,Ch_64,mode,shot_num
0,-0.077213,0.101327,0.055631,-0.159374,-0.069322,-0.187343,-0.089867,-0.065956,-0.023791,0.043367,...,-0.009869,-0.039719,-0.003681,0.012176,0.04621,0.073799,0.034393,0.00965,0,149993
1,-0.022061,-0.009767,-0.106318,0.016186,0.050416,0.067345,-0.024621,-0.059734,0.082643,0.040889,...,0.008635,0.026065,0.031899,0.069404,-0.017025,0.050429,0.046676,0.060313,0,149993
2,0.154426,0.128185,-0.016071,-0.059765,0.107134,0.052652,-0.084942,-0.072178,-0.032556,-0.050801,...,0.007402,0.021101,0.002454,-0.0207,-0.036481,-0.00492,0.097037,0.027744,0,149993
3,-0.009805,-0.026858,-0.042033,-0.028638,-0.06302,-0.086937,-0.124336,-0.064711,-0.01753,-0.01363,...,-0.018504,-0.008688,0.009815,0.003653,0.029185,0.03567,0.060188,0.054282,0,149993
4,0.045347,0.091561,-0.063049,0.017432,0.066801,-0.053876,-0.025852,-0.083378,0.12146,-0.009912,...,0.004934,0.011171,0.018403,0.035311,-0.014593,0.03444,0.034393,0.056695,0,149993
