In [1]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import librosa
import librosa.display
from tqdm import tqdm

# --- Dataset constants ------------------------------------------------------
# Emotion code (third dash-separated field of a file name) -> short label.
EMOTIONS = {1:"neu",2:'cal',3:'hap',4:'sad',5:'ang',6:'fea',7:'dis',8:'sur'}
EMOTION_NUM = 8
# Short label -> display title used for plots/reports.
TILTLEEMOTIONS={"neu":'中性','cal':'平静','hap':'快乐','sad':'悲伤','ang':'生气','fea':'害怕','dis':'厌恶','sur':'惊讶'}
SOURCE_PATH = "db/RAVDESS/"
SAMPLE_RATE = 48000
EPOCH=10
FILE_PATH=[]
DURATION=3
OFFSET=0.5

# --- Build the file index ---------------------------------------------------
# One row per audio file: its path and the raw emotion code taken from the
# file name (e.g. "03-01-05-01-01-01-01.wav" -> "05").
FILE_NAME=[]
EMOTIONLIST=[]
for director, dir_names, file_names in os.walk(SOURCE_PATH):
    # os.walk yields entries in arbitrary OS order; sorting (in place for the
    # directories, so the traversal itself is affected) keeps row indices --
    # and therefore the seeded train/eval/test split -- reproducible.
    dir_names.sort()
    for file_name in sorted(file_names):
        # Skip stray non-audio files (.DS_Store, Thumbs.db, ...) instead of
        # crashing on them while parsing the name.
        if not file_name.lower().endswith('.wav'):
            continue
        name_fields = file_name.split('.')[0].split('-')
        if len(name_fields) < 3:
            continue
        # The explicit '/' keeps the stored paths in the same form on every OS.
        FILE_NAME.append(os.path.join(director+'/',file_name))
        EMOTIONLIST.append(name_fields[2])

df = pd.DataFrame({'FILE': FILE_NAME, 'EMOTION': EMOTIONLIST}, columns=['FILE','EMOTION'])

print("共有{}行".format(df.shape[0]))
df.head(n=10)

共有1440行


Unnamed: 0,FILE,EMOTION
0,db/RAVDESS/Actor_01/03-01-01-01-01-01-01.wav,1
1,db/RAVDESS/Actor_01/03-01-01-01-01-02-01.wav,1
2,db/RAVDESS/Actor_01/03-01-01-01-02-01-01.wav,1
3,db/RAVDESS/Actor_01/03-01-01-01-02-02-01.wav,1
4,db/RAVDESS/Actor_01/03-01-02-01-01-01-01.wav,2
5,db/RAVDESS/Actor_01/03-01-02-01-01-02-01.wav,2
6,db/RAVDESS/Actor_01/03-01-02-01-02-01-01.wav,2
7,db/RAVDESS/Actor_01/03-01-02-01-02-02-01.wav,2
8,db/RAVDESS/Actor_01/03-01-02-02-01-01-01.wav,2
9,db/RAVDESS/Actor_01/03-01-02-02-01-02-01.wav,2


In [2]:
from sklearn.model_selection import train_test_split

# Two-stage split of the row labels of `df`: 80 % for training, then the
# held-out 20 % is halved into a test part and an eval part.
_all_rows = list(df.index)
train_index, test_eval_index = train_test_split(_all_rows, test_size=0.2, random_state=1)
test_index, eval_index = train_test_split(test_eval_index, test_size=0.5, random_state=1)

# Peek at the first ten row labels of every split.
for _caption, _rows in (
    ("train_index前10个:\t", train_index),
    ("eval_index前10个:\t", eval_index),
    ("test_index前十个:\t", test_index),
):
    print(_caption, _rows[:10])

# Report the achieved split ratios.
_n_train, _n_held_out = len(train_index), len(test_eval_index)
print('train_index/(train_index+test_eval_index)={}/({}+{})={}'.format(
    _n_train, _n_train, _n_held_out, _n_train / (_n_train + _n_held_out)))
_n_eval, _n_test = len(eval_index), len(test_index)
print('eval_index/(test_index+eval_index)={}/({}+{})={}'.format(
    _n_eval, _n_test, _n_eval, _n_eval / (_n_eval + _n_test)))


train_index前10个:	 [135, 368, 750, 801, 1322, 1060, 613, 994, 527, 484]
eval_index前10个:	 [1092, 495, 75, 1398, 215, 1108, 1279, 223, 735, 1053]
test_index前十个:	 [65, 1093, 925, 1170, 536, 675, 1417, 921, 1081, 1383]
train_index/(train_index+test_eval_index)=1152/(1152+288)=0.8
eval_index/(test_index+eval_index)=144/(144+144)=0.5


In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def getMELspec(data,sr):
    """Compute a dB-scaled mel spectrogram and MFCCs for one audio signal.

    Args:
        data: 1-D audio signal.
        sr: sampling rate of ``data`` in Hz.  Falls back to the module-level
            ``SAMPLE_RATE`` when ``None``.

    Returns:
        Tuple ``(mel_spec_db, mfccs)``:
        ``mel_spec_db`` has shape (n_mels, t) with n_mels = 64 and is expressed
        in dB relative to the spectrogram's own maximum; ``mfccs`` is what
        ``librosa.feature.mfcc`` derives from that dB spectrogram with its
        default settings.
    """
    # Honour the caller's sampling rate: previously the argument was ignored
    # and SAMPLE_RATE was always used, which silently builds the wrong mel
    # filter bank for audio loaded at any other rate.
    if sr is None:
        sr = SAMPLE_RATE
    mel_spec = librosa.feature.melspectrogram(y = data,
                                              sr = sr,
                                              n_fft=1024,      # length of the FFT window
                                              win_length=1024,
                                              window='hamming',
                                              hop_length=512,
                                              n_mels=64
                                             )
    mel_spec_db = librosa.power_to_db(mel_spec,ref=np.max)
    mfccs = librosa.feature.mfcc(S=mel_spec_db)
    return mel_spec_db,mfccs

# Class ids for every row of `df`: the emotion codes parsed as int32 and
# shifted by one so that they start at 0.
y_data = np.asarray(df['EMOTION']).astype(np.int32) - 1

# Pre-emphasis coefficient used by PreEmphsised.
alpha = 0.97
def PreEmphsised(data):
    """Apply a first-order pre-emphasis filter to a signal.

    Computes ``y[0] = x[0]`` and ``y[t] = x[t] - alpha * x[t-1]`` for t >= 1,
    with ``alpha`` read from the module-level constant.

    Args:
        data: 1-D signal (NumPy array or any sequence convertible to one).

    Returns:
        1-D NumPy array with the same number of samples as ``data``.  An empty
        input yields an empty array instead of raising ``IndexError``.
    """
    data = np.asarray(data)
    if data.size == 0:
        # Nothing to filter; data[0] below would raise on an empty signal.
        return data.reshape(-1).copy()
    return np.append(data[0],data[1:] - alpha * data[:-1])

def sameLenData(data, length=None):
    """Return ``data`` zero-padded (or truncated) to a fixed number of samples.

    Args:
        data: 1-D signal.
        length: target number of samples.  Defaults to
            ``SAMPLE_RATE * DURATION`` (module-level constants).

    Returns:
        New float64 NumPy array of exactly ``length`` samples: ``data`` at the
        start, zeros afterwards.
    """
    if length is None:
        length = int(SAMPLE_RATE*DURATION)
    smLenData = np.zeros(length)
    # Clamp the copied span: a clip longer than the buffer used to raise a
    # broadcast ValueError; it is now truncated instead.
    n_copy = min(len(data), length)
    smLenData[0:n_copy]=data[:n_copy]
    return smLenData



In [5]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import torch

# Feature scaler shared by every SpeechDataset built below: the training split
# fits it, the other splits only apply it.
scaler = StandardScaler()
# Audio window passed to librosa.load: start OFFSET seconds in, read DURATION seconds.
DURATION, OFFSET = 3, 0.5
class SpeechDataset(Dataset):
    """In-memory dataset of standardised dB mel spectrograms with emotion labels.

    For every row label in ``index`` the audio file listed in the module-level
    ``df`` is loaded, zero-padded to a fixed length, pre-emphasised and turned
    into a dB mel spectrogram.  All spectrograms are then standardised with
    the shared module-level ``scaler``.

    Args:
        index: sized iterable of row labels of ``df`` (also used to index ``y_data``).
        setName: pass ``'train_index'`` to *fit* the shared scaler on this
            split.  Any other value only applies the scaler, so the training
            split must be constructed before the others.

    Attributes:
        x_data: float64 tensor of shape (n_samples, 1, n_mels, n_frames).
        y_data: float64 tensor of class ids, shape (n_samples,).
        len: number of samples.
    """

    def __init__(self,index,setName=None):
        self.x_data = []
        # Not used by this class; kept so existing attribute access still works.
        self.arg = np.zeros([1,144000])
        self.y_data = []
        with tqdm(total=len(index)+1,desc="数据加载") as loadbar:
            # One progress tick per file ...
            for row in index:
                self.x_data.append(list(self._mel_features(df.loc[row, 'FILE'])))
                self.y_data.append(y_data[row])
                loadbar.update(1)

            # (n_samples, n_mels, n_frames) -> (n_samples, 1, n_mels, n_frames)
            features = np.expand_dims(self.x_data,1)
            shape = features.shape
            # The scaler works on 2-D input, so flatten each sample, scale,
            # then restore the original layout.  The target shape is passed
            # positionally because the `newshape=` keyword is deprecated in
            # recent NumPy releases.
            flat = np.reshape(features, (shape[0], -1))
            if setName=='train_index':
                flat = scaler.fit_transform(flat)
            else:
                flat = scaler.transform(flat)
            features = np.reshape(flat, shape)
            self.x_data=torch.from_numpy(np.array(features,dtype=np.float64))
            self.y_data=torch.from_numpy(np.array(self.y_data,dtype=np.float64))

            self.len = len(index)
            # ... plus a final tick once scaling/tensor conversion is done.
            loadbar.update(1)

    @staticmethod
    def _mel_features(path):
        """Load one audio file and return its dB mel spectrogram (n_mels, n_frames)."""
        signal, sr = librosa.load(path,sr=SAMPLE_RATE,duration=DURATION,offset=OFFSET)
        # Fixed length first, then pre-emphasis.
        signal = PreEmphsised(sameLenData(signal))
        # The MFCCs returned alongside the spectrogram are not used here.
        mel_data, _ = getMELspec(signal,sr)
        return mel_data

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at position ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.len


In [6]:
BATCH_SIZE = 32

# Datasets.  The training split has to come first: constructing it fits the
# shared scaler that the eval and test splits then reuse.
train_dataset = SpeechDataset(train_index, setName='train_index')
eval_dataset = SpeechDataset(eval_index)
test_dataset = SpeechDataset(test_index)

# Loaders.  Only the training loader shuffles its samples.
train_loader, eval_loader, test_loader = (
    DataLoader(_dataset, batch_size=BATCH_SIZE, shuffle=_shuffle)
    for _dataset, _shuffle in (
        (train_dataset, True),
        (eval_dataset, False),
        (test_dataset, False),
    )
)


数据加载: 100%|████████████████████████████████████████████████████████████████████| 1153/1153 [00:48<00:00, 23.96it/s]
数据加载: 100%|██████████████████████████████████████████████████████████████████████| 145/145 [00:04<00:00, 29.59it/s]
数据加载: 100%|██████████████████████████████████████████████████████████████████████| 145/145 [00:05<00:00, 28.03it/s]


In [7]:
# Feature tensor shape of each split, in train / eval / test order.
for _split in (train_dataset, eval_dataset, test_dataset):
    print(_split.x_data.shape)

torch.Size([1152, 1, 64, 282])
torch.Size([144, 1, 64, 282])
torch.Size([144, 1, 64, 282])


In [8]:
print('x_data:',train_dataset.x_data.shape,'  y_data',train_dataset.y_data.shape)
# Uncomment to print full arrays instead of the abbreviated form:
#np.set_printoptions(threshold=np.inf)
# First standardised sample of each split, in train / eval / test order.
for _split in (train_dataset, eval_dataset, test_dataset):
    print(_split.x_data[0].numpy())

x_data: torch.Size([1152, 1, 64, 282])   y_data torch.Size([1152])
[[[-0.67325471 -0.6821769  -0.69511429 ... -0.63889715 -0.64009395
   -0.12269867]
  [-0.62930198 -0.59434941 -0.60018943 ... -0.55454918 -0.56611444
    0.14377254]
  [-0.61070813 -0.5604981  -0.57121697 ... -0.5390266  -0.5435462
   -0.06885969]
  ...
  [-1.04204697 -1.06620386 -1.07144973 ...  0.01656576 -0.19195955
   -0.21951804]
  [-1.0418595  -1.07270438 -1.07460435 ... -0.04612548 -0.30882887
   -0.15334563]
  [-1.03902234 -1.06338046 -1.07141052 ... -0.09199534 -0.34217952
   -0.38634281]]]
[[[2.15082589 0.50382796 0.43699593 ... 3.69490294 3.83674473 3.89645083]
  [2.38720546 1.01889965 1.01030396 ... 1.4825624  1.45573554 1.25276435]
  [1.89543636 1.08976406 1.02939147 ... 0.51136153 0.52635476 0.48685327]
  ...
  [2.03571829 1.45808851 1.78926593 ... 2.24626856 2.29379796 2.1371975 ]
  [2.08007865 1.46623807 1.63826211 ... 2.16340767 2.16687631 2.19449819]
  [2.16775224 1.86209643 1.72413553 ... 2.14209925 2

In [9]:
class Inception(torch.nn.Module):
    """Inception-style block with four parallel branches concatenated on channels.

    Branches (all preserve the spatial size):
        * 1x1 convolution                               -> 16 channels
        * 1x1 -> 3x3 -> 3x3 convolutions                -> 24 channels
        * 1x1 -> 5x5 convolutions                       -> 24 channels
        * 3x3 average pooling (stride 1) -> 1x1 conv    -> 24 channels

    Args:
        in_channels: number of channels of the input feature map.

    Input:  (N, in_channels, H, W)
    Output: (N, 88, H, W)
    """

    def __init__(self, in_channels):
        super(Inception, self).__init__()
        self.block1x1 = torch.nn.Conv2d(in_channels, 16, kernel_size=1)
        self.block5x5_1 = torch.nn.Conv2d(in_channels, 16, kernel_size=1)
        self.block5x5_2 = torch.nn.Conv2d(16, 24, kernel_size=5, padding=2)
        self.block3x3_1 = torch.nn.Conv2d(in_channels, 16, kernel_size=1)
        self.block3x3_2 = torch.nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.block3x3_3 = torch.nn.Conv2d(24, 24, kernel_size=3, padding=1)
        self.block_pool = torch.nn.Conv2d(in_channels, 24, kernel_size=1)

    def forward(self, x):
        """Run all branches on ``x`` and concatenate their outputs on dim 1."""
        block1x1 = self.block1x1(x)
        block5x5 = self.block5x5_2(self.block5x5_1(x))
        block3x3 = self.block3x3_3(self.block3x3_2(self.block3x3_1(x)))
        # Referenced through `torch.nn.functional` rather than the alias `F`,
        # which is only imported in a later cell and would raise NameError if
        # this block were used before that cell has run.
        pooled = torch.nn.functional.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        block_pool = self.block_pool(pooled)
        outputs = [block1x1, block3x3, block5x5, block_pool]
        return torch.cat(outputs, dim=1)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class attention(nn.Module):
    """Additive attention over GRU outputs, queried by the final GRU hidden state.

    Args:
        en_hidden_dim: hidden size of the (encoder) GRU, per direction.
        de_hidden_dim: size the query state is projected to; also used as the
            size of the intermediate attention layer.
        direction: number of GRU directions, 1 or 2.

    Raises:
        ValueError: if ``direction`` is neither 1 nor 2.
    """

    def __init__(self,en_hidden_dim,de_hidden_dim,direction):
        super().__init__()
        if direction not in (1, 2):
            raise ValueError("direction must be 1 or 2, got {!r}".format(direction))
        self.direction = direction
        middle_size = de_hidden_dim
        # The scoring layer maps the concatenation [query ; encoder output],
        # of size en_hidden_dim*direction + de_hidden_dim, to an intermediate
        # vector whose size is a free choice; de_hidden_dim is used here.
        # The query projection previously hard-coded `en_hidden_dim*2`, which
        # only matched a bidirectional GRU; it now follows `direction`
        # (identical layer shapes when direction == 2).
        self.s_en2de=nn.Linear(en_hidden_dim*direction,de_hidden_dim)
        self.attn=nn.Linear(en_hidden_dim*direction+de_hidden_dim,middle_size)
        self.v = nn.Linear(middle_size,1)

    def forward(self,gru_output,gru_hidden):
        """Compute attention weights over the time steps of ``gru_output``.

        Args:
            gru_output: (batch_size, src_len, en_hidden_dim*direction)
            gru_hidden: (batch_size, n_states, en_hidden_dim), batch first;
                only the last ``direction`` states along dim 1 are used.

        Returns:
            (batch_size, src_len) tensor whose rows sum to 1.
        """
        batch_size,src_len,_=gru_output.shape
        # Concatenate the last `direction` hidden states into one query vector:
        # (batch_size, en_hidden_dim*direction).  For direction == 2 this is
        # exactly cat(gru_hidden[:, -2], gru_hidden[:, -1]).
        s = gru_hidden[:, -self.direction:, :].reshape(batch_size, -1)
        s = self.s_en2de(s)  # (batch_size, de_hidden_dim)
        # Repeat the query along the time axis so it can be concatenated with
        # every encoder step: (batch_size, src_len, de_hidden_dim).
        s = s.unsqueeze(1).repeat(1,src_len,1)
        # Kept as an attribute, as before, so it can be inspected after a forward pass.
        self.attn_hidden = torch.tanh(self.attn(torch.cat((s,gru_output),dim=2)))
        attention_weight = self.v(self.attn_hidden).squeeze(2)  # (batch_size, src_len)

        # Normalise over the time axis so each sample's weights sum to 1.
        return F.softmax(attention_weight,dim=1)

In [18]:
# Hyper-parameters read by Model below.
hidden_size = 128   # GRU hidden size (per direction)
input_size = 16     # GRU input feature size
p = 0.3             # dropout probability
kernel_size = 4     # max-pool window applied before the GRU
stride = 4          # max-pool stride applied before the GRU
directional = 2     # number of GRU directions (2 -> bidirectional)

class Model(nn.Module):
    """Two-branch emotion classifier: a CNN branch plus an attention-GRU branch.

    Both branches read the same spectrogram batch of shape
    (batch, 1, n_mels, n_frames); with this notebook's features that is
    (batch, 1, 64, 282).

    * CNN branch: four conv/batch-norm/ReLU/max-pool/dropout stages, flattened.
      For a 64 x 282 input the pooling chain (2, 2, 4, 2) leaves 64 channels
      of 2 x 8, i.e. the 1024 features hard-wired into ``emo_linear``.
    * GRU branch: the input is max-pooled by ``kernel_size``/``stride``, fed
      to the GRU as a (batch, time, freq) sequence, and summarised by an
      attention-weighted sum of the GRU outputs.

    The concatenated features go through one linear layer that outputs
    ``EMOTION_NUM`` logits.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels, pool):
        """Layers of one CNN stage: 3x3 conv, batch norm, ReLU, max-pool, dropout."""
        return [
            nn.Conv2d(in_channels=in_channels,out_channels=out_channels,kernel_size=3,stride=1,padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=pool,stride=pool),
            nn.Dropout(p),
        ]

    def __init__(self):
        super().__init__()
        # CNN branch.  The stages are unpacked into ONE flat Sequential so the
        # layer indices (and therefore the state_dict keys) stay
        # conv2d.0 ... conv2d.19, compatible with previously saved checkpoints.
        self.conv2d=nn.Sequential(
            *self._conv_stage(1, 16, 2),    # No 1
            *self._conv_stage(16, 32, 2),   # No 2
            *self._conv_stage(32, 64, 4),   # No 3
            *self._conv_stage(64, 64, 2),   # No 4
        )

        # GRU branch
        self.mp = nn.MaxPool2d(kernel_size=kernel_size,stride=stride)
        self.gru = nn.GRU(input_size=input_size,hidden_size=hidden_size,bidirectional =bool(directional-1),batch_first=True)
        self.dp = nn.Dropout(p)

        self.attention = attention(hidden_size,hidden_size,directional)
        # 1024 = flattened size of the CNN branch (see class docstring).
        self.emo_linear=nn.Linear(hidden_size*directional+1024,EMOTION_NUM)

        # Probability 0: currently a no-op, kept as a tuning knob.
        self.out_dropout = nn.Dropout(0)

    def forward(self,x):
        """Return emotion logits of shape (batch, EMOTION_NUM) for a spectrogram batch."""
        x=x.float()

        # CNN branch: (batch, 1, freq, time) -> (batch, features)
        convx = torch.flatten(self.conv2d(x),1)

        # GRU branch
        x = self.mp(x)          # (batch, 1, freq', time')
        # Drop ONLY the channel axis.  A bare torch.squeeze(x) would also
        # remove the batch axis for a batch of one and break the permute below.
        x = x.squeeze(1)        # (batch, freq', time')
        x = x.permute(0,2,1)    # (batch, time', freq')

        # gru_output: (batch, seq_len, hidden_size*directional)
        # gru_h:      (n_layer*directional, batch, hidden_size)
        gru_output,gru_h=self.gru(x)
        gru_h=gru_h.permute(1,0,2)      # batch first, as the attention module expects
        gru_output = self.dp(gru_output)
        attention_weight = self.attention(gru_output,gru_h)   # (batch, seq_len)

        # (batch, 1, seq_len) x (batch, seq_len, hidden_size*directional)
        #   -> (batch, 1, hidden_size*directional) -> (batch, hidden_size*directional)
        attention_weight = attention_weight.unsqueeze(1)
        context = torch.bmm(attention_weight,gru_output)
        context = torch.squeeze(context,1)

        # Fuse both branches and classify.
        output = torch.cat([convx,context],dim=1)
        output = self.out_dropout(self.emo_linear(output))
        return output

In [19]:
# Pick the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Network, loss and optimiser.
model = Model().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5, weight_decay=1e-3)

# Histories filled in by train() and validate().
loss_set, accuracy_train, accuracy_validate = [], [], []
def train(epoch):
    """Train the global ``model`` for one pass over ``train_loader``.

    Side effects:
        * appends every batch loss to ``loss_set``;
        * appends the epoch's training accuracy to ``accuracy_train``;
        * prints a progress line every 20 batches and an accuracy line at the end.

    Args:
        epoch: zero-based epoch number, used only in the progress output.
    """
    # Switch to training mode once per epoch instead of once per batch.
    model.train()
    running_loss = 0.0
    total = 0
    correct = 0
    for batch_idx,(inputs,target) in enumerate(train_loader,0):
        inputs = inputs.to(device)
        # Labels are stored as floats; the loss needs integer class ids.
        target = target.to(device).long()

        optimizer.zero_grad()
        output=model(inputs)
        loss = criterion(output,target)
        loss.backward()
        optimizer.step()

        # Accuracy bookkeeping (detached: no gradient needed here).
        pre = output.detach().argmax(dim=1)
        correct += (pre==target).sum().item()
        total += target.size(0)

        running_loss += loss.item()
        loss_set.append(loss.item())
        if batch_idx % 20 == 19:
            print('[%d,%5d] loss: %.5f, running_loss: %.3f' % (epoch + 1, batch_idx + 1, running_loss / min((batch_idx+1)*BATCH_SIZE,df.shape[0]), running_loss))
            running_loss = 0
    # loss_set holds one value per batch.  The leftover multi-batch running sum
    # is deliberately NOT appended any more: it put a value on a different
    # scale at every epoch boundary of the saved loss curve.

    # Guard against an empty loader instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    percent = 100 * correct / total if total else 0
    accuracy_train.append(accuracy)
    print('Accacy on train_loader set: %d %% [%d/%d]' % (percent, correct, total))
      
#total = 0
#correct = 0
def validate(loader):
    """Measure the accuracy of the global ``model`` on a held-out split.

    Args:
        loader: ``'eval_loader'`` selects ``eval_loader``; any other value
            selects ``test_loader``.  The string is also used in the printed line.

    Returns:
        Fraction of correctly classified samples (0.0 for an empty loader).

    Side effects:
        Prints the accuracy and appends it to ``accuracy_validate`` -- for
        either split, so test-set results end up in that list as well.
    """
    data_loader = eval_loader if loader=='eval_loader' else test_loader
    total = 0
    correct = 0
    model.eval()
    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        for inputs,target in data_loader:
            inputs,target = inputs.to(device),target.to(device)
            output = model(inputs)
            pre = output.argmax(dim=1)
            correct += (pre==target).sum().item()
            total += target.size(0)
    # Guard against an empty loader instead of dividing by zero.
    currect_rate = correct / total if total else 0.0
    percent = 100 * correct / total if total else 0
    print('Accacy on %s set: %d %% [%d/%d]' % (loader,percent, correct, total))
    accuracy_validate.append(currect_rate)
    return currect_rate

cuda:0


In [20]:
# Output locations for checkpoints and training histories.
# Built with os.path.join rather than hard-coded '\\' separators: on Windows
# the resulting strings are unchanged, elsewhere the files now land inside the
# `model` directory instead of getting a literal backslash in their name.
MODEL_DIR = 'model'
# Make sure the directory exists so the first save cannot fail on a missing folder.
os.makedirs(MODEL_DIR, exist_ok=True)
model_path = os.path.join(MODEL_DIR, 'model4.pt')
opti_path = os.path.join(MODEL_DIR, 'opti4.pt')
loss_path = os.path.join(MODEL_DIR, 'loss4.txt')
accur_train_path = os.path.join(MODEL_DIR, 'accur_train4.txt')
accur_valid_path = os.path.join(MODEL_DIR, 'accur_valid4.txt')
def saveModel(model_path):
    """Save the global model's weights and the global optimizer's state.

    Args:
        model_path: destination file for ``model.state_dict()``.  The
            optimizer state always goes to the module-level ``opti_path``.
    """
    # Create missing parent folders first, so a long training run does not die
    # on its first checkpoint just because the output directory is absent.
    for target in (model_path, opti_path):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
    torch.save(model.state_dict(),model_path)
    torch.save(optimizer.state_dict(),opti_path)
        
def loadModel(model_path):
    """Build a fresh ``Model`` on ``device`` and load saved weights into it.

    Args:
        model_path: file previously written with ``torch.save(model.state_dict(), ...)``.

    Returns:
        The new ``Model`` instance, on ``device``, with the saved weights loaded.
    """
    model = Model()
    model.to(device)
    # map_location lets a checkpoint saved on a GPU be restored on a machine
    # without one (or on a different device) instead of failing while
    # deserialising CUDA tensors.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    return model

def saveData(loss,accuracy_train,accuracy_validate):
    """Write the three training histories to their text files.

    Each sequence is stored with one value per line and 8 decimals, at the
    module-level ``loss_path``, ``accur_train_path`` and ``accur_valid_path``.

    Args:
        loss: sequence of loss values.
        accuracy_train: sequence of training accuracies.
        accuracy_validate: sequence of validation accuracies.
    """
    def _dump(path, values):
        # Shared writer so all three files use the same number format.
        np.savetxt(path, np.array(values), fmt='%0.8f')

    _dump(loss_path, loss)
    _dump(accur_train_path, accuracy_train)
    _dump(accur_valid_path, accuracy_validate)

In [None]:
if __name__ == '__main__':
    # Best validation accuracy seen so far; starts from scratch here.
    max_correct_rate=0.0
    for epoch in range(1500):
        train(epoch)
        correct_rate=validate('eval_loader')
        # Nothing to save unless this epoch strictly beats the best so far.
        if not (max_correct_rate < correct_rate):
            continue
        max_correct_rate = correct_rate
        saveModel(model_path)
        

[1,   20] loss: 0.07471, running_loss: 47.812
Accacy on train_loader set: 21 % [252/1152]
Accacy on eval_loader set: 16 % [24/144]
[2,   20] loss: 0.05883, running_loss: 37.650
Accacy on train_loader set: 31 % [365/1152]
Accacy on eval_loader set: 15 % [22/144]
[3,   20] loss: 0.05535, running_loss: 35.426
Accacy on train_loader set: 35 % [414/1152]
Accacy on eval_loader set: 15 % [23/144]
[4,   20] loss: 0.05072, running_loss: 32.458
Accacy on train_loader set: 39 % [450/1152]
Accacy on eval_loader set: 15 % [22/144]
[5,   20] loss: 0.04798, running_loss: 30.707
Accacy on train_loader set: 40 % [468/1152]
Accacy on eval_loader set: 15 % [23/144]
[6,   20] loss: 0.04931, running_loss: 31.556
Accacy on train_loader set: 42 % [492/1152]
Accacy on eval_loader set: 18 % [27/144]
[7,   20] loss: 0.04668, running_loss: 29.877
Accacy on train_loader set: 44 % [513/1152]
Accacy on eval_loader set: 18 % [26/144]
[8,   20] loss: 0.04550, running_loss: 29.122
Accacy on train_loader set: 45 % [522

In [None]:
# Display the best validation accuracy recorded by the training loop so far.
max_correct_rate

In [None]:
if __name__ == '__main__':
    # Continue training.  `max_correct_rate` is NOT reset here, so it must
    # already exist from the earlier training cell; the epoch counter shown in
    # the progress output restarts at 1.
    for epoch in range(3500):
        train(epoch)
        correct_rate=validate('eval_loader')
        # Nothing to save unless this epoch strictly beats the best so far.
        if not (max_correct_rate < correct_rate):
            continue
        max_correct_rate = correct_rate
        saveModel(model_path)

In [None]:
# Display the best validation accuracy after the additional training epochs.
max_correct_rate

In [None]:
if __name__ == '__main__':
    # Reload the checkpoint written by saveModel and rebind the GLOBAL `model`:
    # validate() reads that global, so this assignment must come first.
    model=loadModel(model_path)
    # Any name other than 'eval_loader' makes validate() use test_loader.
    validate('test_loader')