In [11]:
import os
import math
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [33]:
# Shell escape: list the .txt / .wav files in the sample data directory.
!ls 'data/sample/'

sample-000000.txt  sample-000001.wav  sample-000003.txt  sample-000004.wav
sample-000000.wav  sample-000002.txt  sample-000003.wav  sample-000005.txt
sample-000001.txt  sample-000002.wav  sample-000004.txt  sample-000005.wav


In [4]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# Seed Python, NumPy and PyTorch (CPU + all CUDA devices) so that a
# Restart & Run All reproduces the same weights and samples.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [54]:
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder.

    Args:
        input_size: Number of features in each step of the input sequence.
        enc_hid_dim: Hidden size of each GRU direction.
        dec_hid_dim: Stored on the instance; not used by this module itself.
        dropout_rate: Stored on the instance; no dropout is applied here.
    """

    def __init__(self, input_size, enc_hid_dim, dec_hid_dim, dropout_rate):
        super().__init__()
        self.input_size = input_size
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout_rate = dropout_rate

        # The GRU stays wrapped in nn.Sequential so its parameter names
        # (``encoder.0.*``) are unchanged for existing checkpoints.
        self.encoder = nn.Sequential(
            nn.GRU(input_size, enc_hid_dim, bidirectional=True),
        )

    def forward(self, src, init_hidden=None):
        """Encode ``src`` with the bidirectional GRU.

        Args:
            src: Input of shape (seq_len, batch, input_size); the GRU is
                built with the default ``batch_first=False``.
            init_hidden: Optional initial hidden state of shape
                (2, batch, enc_hid_dim). ``None`` starts from zeros.

        Returns:
            Tuple ``(outputs, hidden_states)`` as returned by ``nn.GRU``:
            outputs of shape (seq_len, batch, 2 * enc_hid_dim) and the final
            hidden states of shape (2, batch, enc_hid_dim).
        """
        # nn.Sequential only forwards a single input, so the GRU is called
        # directly; otherwise ``init_hidden`` would be silently dropped.
        gru = self.encoder[0]
        outputs, hidden_states = gru(src, init_hidden)
        return outputs, hidden_states

### Check Encoder

In [58]:
from scipy import signal
from scipy.io import wavfile
import soundfile as sf
import warnings
# NOTE(review): this silences every warning for the rest of the session;
# consider narrowing the filter to the specific warning being hidden.
warnings.filterwarnings("ignore")

# Read one sample recording and compute its spectrogram with scipy's defaults.
samples, sample_rate = sf.read('data/sample/sample-000000.wav')
frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
print(spectrogram.shape)

# Insert a singleton middle axis instead of reshaping with a hard-coded 129,
# so the leading dimension always follows the spectrogram that was computed.
# NOTE(review): scipy returns the spectrogram as (frequency bins, time
# segments), so dim 0 of `src` is the frequency axis. If the GRU is meant to
# step over time, this needs a transpose — confirm.
src = torch.from_numpy(spectrogram).float().unsqueeze(1)
src.size()

(129, 227)


torch.Size([129, 1, 227])

In [59]:
# NOTE(review): scratch cell holding a bare literal. 129 matches the leading
# dimension of `src` shown above; presumably left over from exploration —
# confirm and consider removing.
129

129

In [60]:
# Hyperparameters for the encoder check; INPUT_SIZE equals the last
# dimension of `src`.
INPUT_SIZE, ENC_HID_DIM, DEC_HID_DIM = 227, 256, 256
DROPOUT_RATE = 0.2

encoder = Encoder(
    input_size=INPUT_SIZE,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout_rate=DROPOUT_RATE,
)

# Run the sample through the encoder and display both result shapes.
encoder_outputs, encoder_hidden_states = encoder(src)
(encoder_outputs.shape, encoder_hidden_states.shape)

(torch.Size([129, 1, 512]), torch.Size([2, 1, 256]))

In [72]:
# Tile the two final hidden states 64 times (128 rows) and append one row of
# zeros, so the leading dimension becomes 129 like `encoder_outputs`.
new_hidden_states = torch.cat(
    [
        encoder_hidden_states.repeat(64, 1, 1),
        torch.zeros(1, 1, 256),
    ],
    dim=0,
)
new_hidden_states.shape

torch.Size([129, 1, 256])

In [73]:
# Join encoder outputs and the padded hidden states along the last
# (feature) axis of the 3-D tensors.
total_out = torch.cat([encoder_outputs, new_hidden_states], dim=-1)
total_out.shape

torch.Size([129, 1, 768])

In [76]:
# 2x2 max pooling over the last two axes after swapping the first two axes.
pool = nn.MaxPool2d(kernel_size=2)
pool(total_out.transpose(0, 1)).shape

torch.Size([1, 64, 384])

In [75]:
# Shape of the pooling input (first two axes swapped), for comparison.
total_out.transpose(0, 1).shape

torch.Size([1, 129, 768])