In [10]:
import numpy as np
import librosa

In [23]:
# EMG data shape: (10319, 41), i.e. (samples, channels).
# The EMG data was measured at 2048 Hz, so for an estimated recording length of
# 5 seconds we would expect roughly 10240 samples (one every
# 1 / 2048 = 0.00048828125 seconds). We therefore assume the array holds the EMG
# signals collected from 40 channels plus one additional marker channel that is
# pulled high only at the start of each utterance.
emg_data = np.load('Spk1_Block1-Initial_0001_emg.npy')
# The column slicing below relies on the 2-D (samples, channels) layout.
assert emg_data.ndim == 2, f"expected a 2-D EMG array, got shape {emg_data.shape}"

audio_file = 'Spk1_Block1-Initial_0001_audio.wav'
sr = 16000
# librosa resamples to the requested rate and returns the rate actually used.
audio_data, sr = librosa.load(audio_file, sr=sr)

# Time runs along axis 0 and channels along axis 1, so the marker has to be
# split off by column. Slicing rows here would drop the last sample and leave
# the marker mixed in with the EMG channels.
emg_signals = emg_data[:, :-1]
# Marker channel: signals when the user is speaking and when they are not.
marker_channel = emg_data[:, -1]

# EMG framing, in EMG samples (about 48.8 ms windows, 24.4 ms hop at 2048 Hz).
window_size = 100
hop_length = 50

# Audio framing: sr // 100 samples per hop, i.e. one MFCC frame every 10 ms.
# NOTE(review): the EMG hop (~24.4 ms) and the audio hop (10 ms) give different
# frame rates, so the two feature sequences are not aligned frame-for-frame.
audio_hop_length = sr // 100
n_mfcc = 13

audio_mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=n_mfcc, hop_length=audio_hop_length)

In [26]:
def extract_emg_features(emg_signal):
  """Compute two amplitude features for one window of EMG samples.

  Args:
    emg_signal: array-like of numeric samples (any integer or float dtype).

  Returns:
    A ``(mav, rms)`` tuple: the mean absolute value and the root mean square
    of the samples. Both are NaN for an empty input.
  """
  # Promote to float64 before doing arithmetic: squaring (or taking abs of)
  # narrow integer samples such as int16 wraps around silently in NumPy and
  # would yield a wrong or NaN RMS.
  samples = np.asarray(emg_signal, dtype=np.float64)
  # Mean absolute value
  mav = np.mean(np.abs(samples))
  # Root mean square
  rms = np.sqrt(np.mean(np.square(samples)))
  return mav, rms

# Slice every channel into overlapping windows and compute (MAV, RMS) for each.
n_samples, n_channels = emg_signals.shape
# The "+ 1" keeps the final full window when (n_samples - window_size) is an
# exact multiple of hop_length; without it that window is silently skipped.
window_starts = range(0, n_samples - window_size + 1, hop_length)

# Resulting layout: (channels, windows, 2), with the last axis holding (mav, rms).
emg_features = np.array([
  [
    extract_emg_features(emg_signals[start:start + window_size, ch])
    for start in window_starts
  ]
  for ch in range(n_channels)
])
print(audio_mfccs.shape)

(13, 505)
