In [1]:
import os
import re
from pathlib import Path

import h5py
import numpy as np
import pandas as pd

In [2]:
# Project root: the parent directory of the current working directory.
base_path = Path(os.getcwd()).parent

In [3]:
# Read the confinement database spreadsheet from the project root and preview it.
df = pd.read_excel(os.path.join(base_path, 'confinement_database.xlsx'))
df.head()

Unnamed: 0,shot,tstart (ms),tstop (ms),L-mode,H-mode,QH-mode,WP QH-mode,Notes
0,149992.0,2540.0,2635.0,1.0,,,,
1,149992.0,2638.0,3200.0,,1.0,,,"ELM-free, then ELMy"
2,149992.0,4038.0,4125.0,1.0,,,,
3,149992.0,4136.0,4500.0,,1.0,,,"ELM-free, then ELMy"
4,149993.0,1100.0,1900.0,1.0,,,,long L-mode due to failed LH transition


In [4]:
# Replace every missing value with 0 so the mode flag columns hold 0/1 instead of NaN/1.
# Note: the fill applies to all columns, so empty `Notes` cells become the integer 0 too.
# `fillna` returns a new frame that is rebound to `df`, rather than mutating in place.
df = df.fillna(value=0)
df.head(10)

Unnamed: 0,shot,tstart (ms),tstop (ms),L-mode,H-mode,QH-mode,WP QH-mode,Notes
0,149992.0,2540.0,2635.0,1.0,0.0,0.0,0.0,0
1,149992.0,2638.0,3200.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
2,149992.0,4038.0,4125.0,1.0,0.0,0.0,0.0,0
3,149992.0,4136.0,4500.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
4,149993.0,1100.0,1900.0,1.0,0.0,0.0,0.0,long L-mode due to failed LH transition
5,149993.0,2540.0,2635.0,1.0,0.0,0.0,0.0,0
6,149993.0,2650.0,3400.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
7,149993.0,4050.0,4165.0,1.0,0.0,0.0,0.0,0
8,149993.0,4172.0,4975.0,0.0,1.0,0.0,0.0,"ELM-free, then ELMy"
9,149994.0,1340.0,1900.0,1.0,0.0,0.0,0.0,long L-mode due to failed LH transition


In [5]:
# Print each column's dtype and non-null count for the filled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   shot         41 non-null     float64
 1   tstart (ms)  41 non-null     float64
 2   tstop (ms)   41 non-null     float64
 3   L-mode       41 non-null     float64
 4   H-mode       41 non-null     float64
 5   QH-mode      41 non-null     float64
 6   WP QH-mode   41 non-null     float64
 7   Notes        41 non-null     object 
dtypes: float64(7), object(1)
memory usage: 2.7+ KB


## A few things to confirm:
- Are `tstart` and `tstop` direct row indices into both the `time` and `signals` arrays, or should they be shifted by 800 first?
- Does it make sense to multiply `tstart` and `tstop` by 1000 to retrieve the corresponding signals?

In [6]:
# For every file in <base_path>/data: load the signal and time arrays, look up the
# confinement-database rows for that shot, cut one segment of `signals` per row, and
# tag every sample of the segment with a mode class. The class is the position of the
# set flag among `mode_columns` (0 = L-mode, 1 = H-mode, 2 = QH-mode, 3 = WP QH-mode).
shot_dict = {}
data = pd.DataFrame()
mode_columns = ['L-mode', 'H-mode', 'QH-mode', 'WP QH-mode']
for f in os.listdir(f'{base_path}/data'):
    print(f'Filename: {f}')
    try:
        # Open read-only; the context manager closes the file once both arrays
        # have been copied into memory.
        with h5py.File(f'{base_path}/data/{f}', 'r') as shot_data:
            # The shot number is the first run of digits in the file name.
            shot_num = re.findall(r'\d+', f)[0]
            print(f'Shot number: {shot_num}')
            signals = np.array(shot_data['signals']).transpose()
            time = (np.array(shot_data['time']) * 1000).astype(int)
    except FileNotFoundError:
        print('HDF5 file not found.')
        continue
    print(f'Signals: {signals.shape}')
    print(f'Time: {time.shape}')

    shot_df = df[df['shot'] == float(shot_num)]
    if shot_df.empty:
        # pd.concat raises ValueError on an empty list, so skip shots that have
        # no rows in the database instead of aborting the whole loop.
        print(f'No database entries for shot {shot_num}; skipping.')
        continue

    channel_names = [f'Ch_{j+1}' for j in range(signals.shape[1])]
    segments = []
    for i, row in shot_df.iterrows():
        tstart = int(row['tstart (ms)'])
        tstop = int(row['tstop (ms)'])
        # NOTE(review): `time` is loaded but not used here; tstart/tstop are applied
        # directly as row positions in `signals`. Whether that matches the database's
        # millisecond timestamps is one of the open questions listed in the markdown
        # cell above -- confirm before training on these segments.
        tmode_signal = signals[tstart:tstop]
        print(tmode_signal.shape)
        signal_df = pd.DataFrame(tmode_signal, columns=channel_names)
        label_vec = row[mode_columns].to_numpy(dtype=int)
        # NOTE: np.argmax returns 0 when no flag is set, so a row with all four
        # flags equal to 0 gets the same class as L-mode.
        signal_df['mode'] = np.argmax(label_vec)
        segments.append(signal_df)
    signal = pd.concat(segments, axis=0)
    print('Target value counts:')
    print(signal['mode'].value_counts())


Filename: bes_signals_149993.hdf5
Shot number: 149993
Signals: (5242880, 64)
Time: (5242880,)
(800, 64)
(95, 64)
(750, 64)
(115, 64)
(803, 64)
Target value counts:
1    1553
0    1010
Name: mode, dtype: int64
Filename: bes_signals_149992.hdf5
Shot number: 149992
Signals: (5242880, 64)
Time: (5242880,)
(95, 64)
(562, 64)
(87, 64)
(364, 64)
Target value counts:
1    926
0    182
Name: mode, dtype: int64
