In [1]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm import tqdm_notebook
import torch
import librosa
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from sklearn.model_selection import KFold
from scipy.stats import entropy
from scipy.signal import butter, lfilter, freqz
import matplotlib.pyplot as plt

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(42)

In [41]:
n_epoch = 70
batch_size = 8192
n_frame = 242

window = int(n_frame*1.1)
step = int(n_frame/5)

In [4]:
def check_data(y,threshold):
    if(np.max(y)-np.min(y)<threshold):
        return False
    return True

In [5]:
def butter_lowpass(cutoff, fs, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a


def butter_lowpass_filter(data, cutoff, fs, order=5):
    b, a = butter_lowpass(cutoff, fs, order=order)
    y = lfilter(b, a, data)
    return y

In [6]:
def normalize(y):
    return -1 + 2*(y-np.min(y)/(np.max(y)-np.min(y)))

In [7]:
train_X = []
train_y = []
val_X = []
val_y = []

for idx,drt in tqdm_notebook(enumerate(os.listdir('./datasets/voco'))):
    
    for file in os.listdir('./datasets/voco/'+drt):
        if 'EGG' in file and ('wav' in file or 'WAV' in file):
            try:
                x,sr = librosa.load('./datasets/voco/'+drt+'/'+file,sr=48000,mono=False)
                x = librosa.resample(x, sr, 16000)
                itvs = librosa.effects.split(x[0],frame_length = 1024, hop_length = 512)

                for st_idx,end_idx in itvs:
                    speech,egg = x[0][st_idx:end_idx],x[1][st_idx:end_idx]
                    speech = butter_lowpass_filter(speech,2500,16000)
                    i=0
                    while(i*step+window < len(speech)):
                        tmp_speech = speech[i*step:i*step+window]
                        tmp_egg = egg[i*step:i*step+window]
                        if check_data(tmp_egg,0.5):
                            tmp_egg = normalize(tmp_egg)
                            if idx<50:
                                train_X.append(tmp_speech)
                                train_y.append(tmp_egg)
                            else:
                                val_X.append(tmp_speech)
                                val_y.append(tmp_egg)
                        i+=1
            except:
                print('nop')
                continue


for drt in ['./datasets/cmu_us_bdl_arctic/orig/','./datasets/cmu_us_jmk_arctic/orig/','./datasets/cmu_us_slt_arctic/orig/']:
    for file in tqdm_notebook(os.listdir(drt)):
        x,sr = librosa.load(drt+file,sr=16000,mono=False)
        itvs = librosa.effects.split(x[0],frame_length = 1024, hop_length = 512)
        
        for st_idx,end_idx in itvs:
            speech,egg = x[0][st_idx:end_idx],x[1][st_idx:end_idx]
            speech = butter_lowpass_filter(speech,2500,16000)
            i=0
            while(i*step+window < len(speech)):
                tmp_speech = speech[i*step:i*step+window]
                tmp_egg = egg[i*step:i*step+window]
                if check_data(tmp_egg,0.5):
                    tmp_egg = normalize(tmp_egg)
                    if file[7]=='a':
                        train_X.append(tmp_speech)
                        train_y.append(tmp_egg)                   
                    if file[7]=='b':    
                        val_X.append(tmp_speech)
                        val_y.append(tmp_egg)
                i+=1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

nop



HBox(children=(IntProgress(value=0, max=1131), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1114), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1132), HTML(value='')))




In [28]:
train_X = np.array(train_X).astype('float')
train_y = np.array(train_y).astype('float')
val_X = np.array(val_X).astype('float')
val_y = np.array(val_y).astype('float')

In [29]:
print(len(train_X))
print(len(val_X))

637420
500234


In [31]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.rnn = nn.LSTM(input_dim, hid_dim, num_layers = n_layers, dropout = dropout,bidirectional = False,batch_first=True)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [bs, n_frame, 1]

        embedded = self.dropout(x)
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [ bs,n_frame,  hid dim * n directions]
        #hidden = [ n layers * n directions,bs, hid dim]
        #cell = [ n layers * n directions,bs, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

In [32]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.rnn = nn.LSTM(output_dim, hid_dim,num_layers = n_layers, dropout = dropout,bidirectional = False,batch_first=True)
        
        self.out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [bs,1]
        #hidden = [n layers * n directions,bs, hid dim]
        #cell = [n layers * n directions,bs, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [ n layers,bs, hid dim]
        #context = [ n layers,bs, hid dim]

        input = input.unsqueeze(1)
        #input = [bs,1,1]
        
        embedded = self.dropout(input)
        
        #embedded = [bs,1, 1]
        
        
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        #output = [batch size, n_frame, hid dim * n directions]
        #hidden = [batch size,n layers * n directions,  hid dim]
        #cell = [batch size,n layers * n directions,  hid dim]
        
        #sent len and n directions will always be 1 in the decoder, therefore:
        #output = [batch size,1,  hid dim]
        #hidden = [batch size,n layers,  hid dim]
        #cell = [batch size, n layers,  hid dim]
        
        prediction = self.out(output.squeeze(1))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [33]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    def forward(self,x,y,teacher_forcing_ratio=0.5):
        #x = [bs,n_frame,1]
        #y = [bs,n_frame,1]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = y.shape[0]
        n_frame = y.shape[1]
        
        #tensor to store decoder outputs
        outputs = torch.zeros(batch_size,n_frame,1).cuda()
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(x)
        
        #first input to the decoder is the target[0,""] (i.e [bs, 1])
        input = torch.Tensor(np.array([-1]*batch_size)).unsqueeze(1).cuda()
        
        for t in range(n_frame):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            #place predictions in a tensor holding predictions for each token
            outputs[:,t,:] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = y[:,t,:] if teacher_force else output
        
        return outputs

In [34]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,X,y,n_frame,is_train):
        self.X = X
        self.y = y
        self.n_frame = n_frame
        self.is_train = is_train
    def __len__(self):
        return len(self.X)
    def __getitem__(self,idx):
        if self.is_train:
            pi = random.randint(0,len(self.X[idx])-self.n_frame)        
            _x,_y = self.X[idx][pi:pi+self.n_frame],self.y[idx][pi:pi+self.n_frame]
        else:
            _x,_y = self.X[idx],self.y[idx]
        return np.expand_dims(_x,axis=-1),np.expand_dims(_y,axis=-1)

In [35]:
train_dataset = Dataset(train_X,train_y,n_frame = n_frame, is_train = True)
valid_dataset = Dataset(val_X,val_y,n_frame = n_frame, is_train = False)
train_loader = data.DataLoader(dataset=train_dataset,
                               batch_size=batch_size,
                               num_workers=2,
                               shuffle=True)
valid_loader = data.DataLoader(dataset=valid_dataset,
                               batch_size=batch_size,
                               num_workers=2,
                              shuffle=False)

In [36]:
HID_DIM = 8
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

encoder = Encoder(input_dim = 1, hid_dim = HID_DIM,n_layers=N_LAYERS, dropout =ENC_DROPOUT)
decoder = Decoder(output_dim = 1, hid_dim = HID_DIM, n_layers=N_LAYERS,dropout =DEC_DROPOUT)
model = Seq2Seq(encoder,decoder)
model.cuda()
criterion = nn.MSELoss()
criterion.cuda()

MSELoss()

In [37]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (rnn): LSTM(1, 8, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (rnn): LSTM(1, 8, num_layers=2, batch_first=True, dropout=0.5)
    (out): Linear(in_features=8, out_features=1, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [38]:
lr = 3e-4
param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
optimizer = torch.optim.Adam(param_lrs, lr=lr)

In [39]:
torch.backends.cudnn.enabled = False

In [40]:
for epoch in range(n_epoch):
    avg_loss = 0.
    avg_entropy = 0.
    optimizer.zero_grad()
    model.train()
    for idx,(_x,_y) in enumerate(tqdm_notebook(train_loader)):
        x_train,y_train = _x.cuda(),_y.cuda()
        pred = model(x_train,y_train,teacher_forcing_ratio=0.5)
        loss = criterion(pred,y_train)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        avg_loss += loss.item() / len(train_loader)
        
    val_loss = 0.
    val_entropy = 0.
    model.eval()
    with torch.no_grad():
        for idx,(_x,_y) in enumerate(tqdm_notebook(valid_loader)):
            x_val,y_val = _x.cuda(),_y.cuda()
            pred = model(x_val,y_val,teacher_forcing_ratio=0)
            loss= criterion(pred,y_val)
            val_loss += loss.item()/len(valid_loader)
    print("Epoch [%d]/[%d] train_loss %.6f valid_loss %.6f "%
          (epoch,n_epoch,avg_loss,val_loss))

HBox(children=(IntProgress(value=0, max=78), HTML(value='')))




RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #2 'mat2'