In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import librosa
import librosa.display

# Numeric emotion id -> three-letter emotion tag.
EMOTIONS = {
    1: 'neu',
    2: 'cal',
    3: 'hap',
    4: 'sad',
    5: 'ang',
    6: 'fea',
    7: 'dis',
    8: 'sur',
}
# Number of emotion classes (one per entry of EMOTIONS).
EMOTION_NUM = 8
# Three-letter emotion tag -> Chinese display label
# (presumably used for plot titles -- usage is not visible in this part of the notebook).
TILTLEEMOTIONS = {
    'neu': '中性',
    'cal': '平静',
    'hap': '快乐',
    'sad': '悲伤',
    'ang': '生气',
    'fea': '害怕',
    'dis': '厌恶',
    'sur': '惊讶',
}
# Root directory that is walked to discover the audio files.
SOURCE_PATH = 'db/RAVDESS/'
# Sampling rate (Hz) every clip is loaded at.
SAMPLE_RATE = 48000
# NOTE(review): presumably the number of training epochs -- confirm against the training cell.
EPOCH = 10
# Starts out empty; not referenced in the visible part of the notebook.
FILE_PATH = []


In [None]:
# Build the file index: one row per audio file found below SOURCE_PATH.
#   FILE    -- path of the audio file
#   EMOTION -- third '-'-separated field of the file name (kept as a string)
FILE_NAME = []
EMOTIONLIST = []
for director, sub_dirs, file_names in os.walk(SOURCE_PATH):
    # os.walk yields entries in filesystem-dependent order. Sorting (the
    # directory list in place, so the traversal itself is ordered) keeps the
    # row order -- and therefore the seeded train/eval/test split that is
    # derived from the row index -- identical across machines and runs.
    sub_dirs.sort()
    for file_name in sorted(file_names):
        name_fields = file_name.split('.')[0].split('-')
        if len(name_fields) < 3:
            # Not a '-'-separated data file name (e.g. ".DS_Store", a README):
            # skip it instead of failing with an IndexError.
            continue
        FILE_NAME.append(os.path.join(director, file_name))
        EMOTIONLIST.append(name_fields[2])

# Create the frame in one step from the collected columns.
df = pd.DataFrame({'FILE': FILE_NAME, 'EMOTION': EMOTIONLIST}, columns=['FILE', 'EMOTION'])

print("共有{}行".format(df.shape[0]))
df.head(n=10)

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split of the row labels of df:
#   1) 70% train+eval / 30% test
#   2) the train+eval part again into 87.5% train / 12.5% eval
# Both stages use random_state=1, so the split is repeatable for a fixed row order.
train_eval_index, test_index = train_test_split(
    list(df.index), test_size=0.3, random_state=1
)
train_index, eval_index = train_test_split(
    train_eval_index, test_size=0.125, random_state=1
)

# Preview of the first ten row labels of each split.
print("train_index前10个:\t", train_index[:10])
print("eval_index前10个:\t", eval_index[:10])
print("test_index前十个:\t", test_index[:10])

# Report the achieved split ratios.
test_count = len(test_index)
train_eval_count = len(train_eval_index)
train_count = len(train_index)
eval_count = len(eval_index)
print('test_index/(train_index+test_index)={}/({}+{})={}'.format(
    test_count, train_eval_count, test_count, test_count / (test_count + train_eval_count)))
print('eval_index/(train_index+eval_index)={}/({}+{})={}'.format(
    eval_count, train_count, eval_count, eval_count / (eval_count + train_count)))


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def getMELspec(data, sr):
    """Compute a log-scaled (dB) mel spectrogram of an audio signal.

    Args:
        data: audio time series, passed to librosa as ``y``.
        sr: sampling rate of ``data`` in Hz.

    Returns:
        The mel spectrogram converted to dB with the maximum power as the
        reference (so the largest value is 0 dB); shape ``(n_mels, t)`` with
        ``n_mels = 128``.
    """
    mel_spec = librosa.feature.melspectrogram(
        y=data,
        # Use the caller's sampling rate. The global SAMPLE_RATE was used here
        # before, which silently ignored the ``sr`` argument and would place
        # the mel filters wrongly for audio loaded at any other rate.
        sr=sr,
        n_fft=1024,        # length of the FFT window
        win_length=512,
        window='hamming',
        hop_length=256,
        n_mels=128,
    )
    return librosa.power_to_db(mel_spec, ref=np.max)

def labelEncoder():
    """One-hot encode the ``EMOTION`` column of the module-level ``df``.

    Returns:
        Dense 2-D array with one row per row of ``df`` and one column per
        distinct emotion value; columns follow the encoder's category order.
    """
    # OneHotEncoder expects a 2-D input: one sample per row, a single feature column.
    emotion_column = np.array(df['EMOTION']).reshape(-1, 1)
    one_hot = OneHotEncoder().fit_transform(emotion_column)
    return one_hot.toarray()

# One-hot label matrix for all rows of df, in df's row order.
y_data = labelEncoder()
label_matrix_shape = np.array(y_data).shape
print(label_matrix_shape)

In [None]:
# Default pre-emphasis coefficient.
alpha = 0.97


def PreEmphsised(data, coef=None):
    """Apply a first-order pre-emphasis filter to a signal.

    Computes ``y[0] = x[0]`` and ``y[n] = x[n] - coef * x[n-1]`` for ``n >= 1``.

    Args:
        data: 1-D signal; any array-like (NumPy array, list, tuple).
        coef: filter coefficient. ``None`` (the default) uses the module-level
            ``alpha`` as it is at call time.

    Returns:
        NumPy array with the same number of samples as ``data``. An empty
        input yields an empty array instead of raising ``IndexError``.
    """
    if coef is None:
        coef = alpha
    # Coerce so plain Python sequences work too (list * float would raise).
    signal = np.asarray(data)
    if signal.size == 0:
        return signal.astype(np.float64).ravel()
    return np.append(signal[0], signal[1:] - coef * signal[:-1])

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import torch

# Seconds of audio read from each file (passed to librosa.load as `duration`).
DURATION = 3
# Seconds skipped at the start of each file (passed to librosa.load as `offset`).
OFFSET = 0.5
class SpeechDataset(Dataset):
    """In-memory dataset of standardized mel spectrograms with one-hot labels.

    For every row label in ``index`` the audio file ``df.loc[i, 'FILE']`` is
    loaded (``DURATION`` seconds starting at ``OFFSET``, at ``SAMPLE_RATE``),
    zero-padded to a fixed length, pre-emphasised and converted to a mel
    spectrogram by ``getMELspec``. The features of all samples are then
    standardized per flattened feature position (mel bin x frame).

    Args:
        index: sequence of row labels of ``df`` (also used to index ``y_data``).
        scaler: optional, already fitted scaler exposing ``transform``. When
            given it is applied as-is, which lets evaluation/test data reuse
            the statistics fitted on the training data, e.g.
            ``SpeechDataset(eval_index, scaler=train_dataset.scaler)``.
            When ``None`` (default) a new ``StandardScaler`` is fitted on this
            dataset's own samples.

    Attributes:
        scaler: the scaler that was fitted or supplied.
        x_data: float64 tensor holding one spectrogram per sample.
        y_data: float64 tensor holding one one-hot label row per sample.
        len: number of samples.

    Raises:
        ValueError: if ``index`` is empty.
    """

    def __init__(self, index, scaler=None):
        if len(index) == 0:
            raise ValueError("index must contain at least one sample")

        # Fixed number of samples per clip; int() guards against a fractional DURATION.
        target_len = int(SAMPLE_RATE * DURATION)
        features = []
        labels = []
        for i in tqdm(index, total=len(index), desc="数据加载"):
            data, sr = librosa.load(df.loc[i, 'FILE'], sr=SAMPLE_RATE,
                                    duration=DURATION, offset=OFFSET)
            # Truncate before copying: a clip that comes back even one sample
            # longer than the buffer would make the slice assignment raise.
            data = data[:target_len]
            smLenData = np.zeros(target_len)
            smLenData[:len(data)] = data  # shorter clips stay zero-padded at the end
            # Pre-emphasis
            smLenData = PreEmphsised(smLenData)
            # Feature matrix for this clip
            features.append(getMELspec(smLenData, sr))
            labels.append(y_data[i])

        features = np.asarray(features, dtype=np.float64)
        shape = features.shape
        # The scaler works on 2-D input: flatten every spectrogram to one row,
        # scale, then restore the original layout.
        flat = features.reshape(shape[0], -1)
        if scaler is None:
            self.scaler = StandardScaler()
            flat = self.scaler.fit_transform(flat)
        else:
            self.scaler = scaler
            flat = self.scaler.transform(flat)

        self.x_data = torch.from_numpy(np.asarray(flat, dtype=np.float64).reshape(shape))
        self.y_data = torch.from_numpy(np.asarray(labels, dtype=np.float64))

        self.len = len(index)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at position ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len


In [None]:
# Number of samples per mini-batch, shared by all three loaders.
BATCH_SIZE = 64

# Training split: sample order is reshuffled by the loader.
train_dataset = SpeechDataset(train_index)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

# Evaluation split: fixed sample order.
eval_dataset = SpeechDataset(eval_index)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Test split: fixed sample order.
test_dataset = SpeechDataset(test_index)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


In [None]:
import torch
import torch.nn as nn
class attention(nn.Module):
    """Additive attention over the outputs of a GRU encoder.

    The encoder's final hidden state is projected to the decoder size, paired
    with every encoder output step, and scored with ``v(tanh(attn([s; h_t])))``.
    The scores are normalized with a softmax over the source steps.

    Args:
        en_hidden_dim: hidden size of the encoder GRU (per direction).
        de_hidden_dim: hidden size of the decoder; also used as the size of
            the intermediate attention layer.
        direction: number of encoder directions (1 = unidirectional,
            2 = bidirectional).
    """

    def __init__(self, en_hidden_dim, de_hidden_dim, direction):
        super().__init__()
        self.direction = direction
        # The intermediate size of the scoring layer is a free choice;
        # de_hidden_dim is used here.
        middle_size = de_hidden_dim
        # Projects the concatenated final encoder states (one per direction)
        # to the decoder size. The input width follows `direction` (it was
        # hard-coded to 2 before, which only fits a bidirectional encoder).
        self.s_en2de = nn.Linear(en_hidden_dim * direction, de_hidden_dim)
        # Scores the concatenation [s; h_t] of projected state and encoder output.
        self.attn = nn.Linear(en_hidden_dim * direction + de_hidden_dim, middle_size)
        # Reduces every scored step to a single scalar energy.
        self.v = nn.Linear(middle_size, 1)

    def forward(self, gru_output, gru_hidden):
        """Compute attention weights over the source steps.

        Args:
            gru_output: encoder outputs, shape
                ``(batch_size, src_len, en_hidden_dim * direction)``.
            gru_hidden: final encoder hidden state in the layout returned by
                ``nn.GRU``: ``(num_layers * direction, batch_size, en_hidden_dim)``.
                NOTE(review): the caller is not visible here -- confirm it
                passes ``h_n`` unchanged.

        Returns:
            Tensor of shape ``(batch_size, 1, src_len)`` whose last axis sums to 1.
        """
        _, src_len, _ = gru_output.shape

        # Last layer's final state of every direction, concatenated on the
        # feature axis: (batch_size, en_hidden_dim * direction).
        s = torch.cat([gru_hidden[-k] for k in range(self.direction, 0, -1)], dim=1)
        s = self.s_en2de(s)  # (batch_size, de_hidden_dim)

        # Repeat along a new step axis so it can be paired with every encoder
        # output: (batch_size, src_len, de_hidden_dim).
        s = s.unsqueeze(1).repeat(1, src_len, 1)

        # Kept on the module, as before, so it can be inspected after a forward pass.
        self.attn_hidden = torch.tanh(self.attn(torch.cat((s, gru_output), dim=2)))

        # v yields (batch_size, src_len, 1); move the step axis last so the
        # softmax normalizes across source steps rather than over a size-1
        # axis (which would make every weight 1): (batch_size, 1, src_len).
        attention_weight = self.v(self.attn_hidden).transpose(1, 2)

        return torch.softmax(attention_weight, dim=2)