导入库

In [9]:
import importlib
import os

import numpy as np

import utils

try:
    # `imp` is deprecated and was removed in Python 3.12. It is still imported
    # here only because a later cell calls `imp.reload(...)`.
    import imp
except ImportError:
    # On interpreters without `imp`, expose importlib under the old name so
    # that the later `imp.reload(...)` call keeps working.
    imp = importlib

# Pick up edits to the local helper module without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/data1/byzhao/EEG-AudioTransformer/utils.py'>

公共变量

In [10]:
# Shared settings used by the cells below.
words_path = './feat/words'   # root folder; joined with `pt` to locate the word files
config_path = './config'      # folder containing '<model_name>.json'
pt = 'sub-06'                 # participant id; also used in checkpoint and log paths
model_name = 'h4l6p3f40'      # selects both the config file and the checkpoint file
test_word = 10                # index into the participant folder's file listing

读取数据

In [11]:
# Folder holding the saved word files of the selected participant.
folder_path = os.path.join(words_path, pt)

# os.listdir returns entries in arbitrary order, so sort the listing to make
# index `test_word` select the same file on every run and machine.
# NOTE(review): if other scripts index the *unsorted* listing, confirm that
# they end up selecting the same file.
filename = sorted(os.listdir(folder_path))[test_word]

# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load files
# from a trusted source.
word_info = np.load(os.path.join(folder_path, filename), allow_pickle=True)

# The loaded array wraps a single mapping; unwrap it once instead of calling
# .item() for every field.
word_record = word_info.item()
word = word_record['label']
eeg = word_record['eeg']
audio = word_record['audio']

print(eeg.shape, audio.shape)

(1290, 127) (20160,)


数据预处理

In [12]:
# Framing parameters passed to both feature extractors
# (presumably in seconds -- TODO confirm against utils).
window_length = 0.025
frameshift = 0.005

# Sampling rates passed to the extractors (presumably in Hz -- TODO confirm).
eeg_sample_rate = 1024
audio_sample_rate = 16000

# Misspelled legacy name kept as an alias because other cells reference it.
audio_sameple_rate = audio_sample_rate

提取高频eeg信号和音频信号的梅尔频谱

In [13]:
# Turn the raw signals into frame-level features using the shared window and
# shift settings: EEG features via utils.extractHG, audio via utils.extractMelSpecs.
eeg = utils.extractHG(
    eeg, eeg_sample_rate, windowLength=window_length, frameshift=frameshift
)
melspec = utils.extractMelSpecs(
    audio, audio_sameple_rate, windowLength=window_length, frameshift=frameshift
)
print(eeg.shape, melspec.shape)

# The two extractors can disagree on the frame count; keep only the frames
# both have. Slicing to the common length is a no-op when they already match.
common_len = min(melspec.shape[0], eeg.shape[0])
melspec, eeg = melspec[:common_len, :], eeg[:common_len, :]
print(eeg.shape, melspec.shape)

(246, 127) (247, 40)
(246, 127) (246, 40)


z-score 标准化(减去均值后除以标准差)

In [14]:
# Standardise with one mean and one standard deviation computed over every
# element of the array (no axis is given, so it is not per channel).
eeg_mean, eeg_std = np.mean(eeg), np.std(eeg)
eeg = (eeg - eeg_mean) / eeg_std

加载模型

In [15]:
import importlib
import json

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

import transformer

# importlib.reload replaces the deprecated imp.reload (the `imp` module was
# removed in Python 3.12). Assigning the result keeps this cell output-free.
transformer = importlib.reload(transformer)

In [16]:
# Read the hyper-parameters stored under 'model_config' for this model name.
with open(os.path.join(config_path, f'{model_name}.json'), 'r') as f:
    cfg = json.load(f)['model_config']

prv_frame = cfg['prv_frame']
batch_size = cfg['batch_size']
epochs = cfg['epochs']
lr = cfg['lr']
b1 = cfg['b1']
b2 = cfg['b2']
scaled_dim = cfg['scaled_dim']
d_model = cfg['d_model']
nhead = cfg['nhead']
n_layer = cfg['n_layer']

# Feature sizes come from the data prepared above (last axis of each array).
input_dim = eeg.shape[-1]
output_dim = melspec.shape[-1]

# Restrict the visible GPUs before the first CUDA query below.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Keep the tensor type consistent with the selected device: a hard-coded
# torch.cuda.FloatTensor breaks `.type(tensor_type)` when running on CPU.
tensor_type = torch.cuda.FloatTensor if device.type == "cuda" else torch.FloatTensor

model = transformer.Model(
    input_dim=input_dim,
    output_dim=output_dim,
    scaled_dim=scaled_dim,
    prv_dim=prv_frame,
    d_model=d_model,
    nhead=nhead,
    n_layer=n_layer
).to(device)

# L1 loss is used here; nn.MSELoss(reduction='mean') is the alternative that
# was previously tried.
criterion = nn.L1Loss(reduction='mean').to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(b1, b2))

print(input_dim, output_dim)

127 40


输入预处理

In [17]:
# For every frame build a window made of that frame and the `prv_frame - 1`
# frames before it; frames missing at the start are filled with zero rows.
# Result shape: (n_frames, prv_frame, n_channels).
if eeg.ndim != 2:
    # This cell overwrites `eeg`; running it twice would window already
    # windowed data, so fail with a clear message instead.
    raise ValueError(
        f"expected 2-D eeg of shape (frames, channels), got shape {eeg.shape}; "
        "re-run the preprocessing cells before running this cell again"
    )

n_frames, n_channels = eeg.shape
n_pad = max(prv_frame - 1, 0)

# Pad once at the front (same dtype as the data) instead of inserting zero
# rows one at a time inside a Python loop.
eeg_padded = np.concatenate(
    [np.zeros((n_pad, n_channels), dtype=eeg.dtype), eeg], axis=0
)

# Row i of window_idx lists the padded-row indices i .. i + prv_frame - 1,
# i.e. original frames i - prv_frame + 1 .. i.
window_idx = np.arange(n_frames)[:, None] + np.arange(prv_frame)[None, :]
eeg = eeg_padded[window_idx]

In [18]:
# Sanity check: after windowing the array is (frames, prv_frame, channels).
eeg.shape

(246, 3, 127)

In [19]:
# map_location remaps the stored tensors onto the device selected above, so
# the checkpoint also loads when the GPU it was saved from is not available
# (different GPU index, or a CPU-only machine).
checkpoint = torch.load(f'./res/{pt}/{model_name}.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to evaluation mode (disables dropout) before running inference.
model.eval()

Model(
  (l1): Sequential(
    (0): Linear(in_features=127, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
  )
  (transformer): TransformerModel(
    (encoder_layer): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (linear1): Linear(in_features=256, out_features=1024, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=1024, out_features=256, bias=True)
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (transformer): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): Multihe

模型推理、频谱可视化并转换为MFCC

In [20]:
# Inference only: no_grad avoids building an autograd graph for the forward
# pass. The input is cast to float32 and created directly on `device`, so
# this works on both CPU and GPU.
with torch.no_grad():
    eeg_tensor = torch.as_tensor(eeg, dtype=torch.float32, device=device)
    model_output = model(eeg_tensor).cpu().numpy()

In [21]:
import matplotlib.pyplot as plt
import librosa
import librosa.display  # not guaranteed to be loaded by a bare `import librosa`
from torch.utils.tensorboard import SummaryWriter

# Derive the display parameters from the extraction settings instead of
# repeating them as literals (16000 * 0.005 -> 80, 16000 * 0.025 -> 400).
hop_length = int(round(audio_sameple_rate * frameshift))
win_length = int(round(audio_sameple_rate * window_length))


def _plot_melspec(spec, title):
    """Draw one (frames, mel-bins) spectrogram on its own figure and return the figure."""
    fig, ax = plt.subplots()
    img = librosa.display.specshow(
        spec.T,
        sr=audio_sameple_rate,
        hop_length=hop_length,
        win_length=win_length,
        x_axis='time',
        y_axis='mel',
        ax=ax,
    )
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    ax.set_title(title)
    return fig


origin_melspec_fig = _plot_melspec(melspec, f'{pt}-{word}-origin')
model_melspec_fig = _plot_melspec(model_output, f'{pt}-{word}-model')

# The context manager closes the writer even if logging raises.
with SummaryWriter(f'./logs/{pt}/{model_name}') as writer:
    # add_figure closes the figure by default, which would leave nothing for
    # plt.show() below; close=False keeps both figures open.
    writer.add_figure(
        tag=f"{pt}-{word}-origin log Mel spectrogram",
        figure=origin_melspec_fig,
        close=False,
    )
    writer.add_figure(
        tag=f"{pt}-{word}-model log Mel spectrogram",
        figure=model_melspec_fig,
        close=False,
    )

plt.show()

In [22]:
# Convert the predicted and the reference spectrograms with utils.toMFCC.
model_mfcc = utils.toMFCC(model_output)
mfcc = utils.toMFCC(melspec)

# Mean per-frame Euclidean distance between the two coefficient sequences.
# NOTE(review): this is the plain mean distance; no mel-cepstral-distortion
# scaling constant is applied here -- confirm whether utils.toMFCC accounts
# for it.
frame_count = mfcc.shape[0]
eu_dis = sum(
    (np.linalg.norm(model_mfcc[i] - mfcc[i]) for i in range(frame_count)),
    0,
)
mcd = eu_dis / frame_count

print(model_output.shape, model_mfcc.shape)
print(melspec.shape, mfcc.shape)
print(mcd)

(246, 40) (246, 13)
(246, 40) (246, 13)
1.623258642696099


In [23]:
# Show the reference and predicted spectrogram shapes, one per line.
print(melspec.shape, model_output.shape, sep='\n')

(246, 40)
(246, 40)
