In [11]:
import os
import math
import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from encoder import Encoder
from attention import Attention
from decoder import Decoder
from generator import Generator

In [2]:
import librosa
import warnings

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from librosa/torch. Consider narrowing it.
warnings.filterwarnings("ignore")

# Load one sample utterance, resampled to 16 kHz.
y, sr = librosa.load('data/sample/sample-000003.wav', sr=16000)

# Pass the signal and sample rate by keyword: recent librosa releases make
# these parameters keyword-only, so the positional form raises TypeError.
mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
print(mel_spectrogram.shape, sr)

# Insert a singleton middle axis: (rows, cols) -> (rows, 1, cols).
# NOTE(review): this puts the mel-bin axis first and the frame axis last, so
# the encoder's input width ends up tied to the clip length. If the encoder
# expects (time, batch, features), the spectrogram should be transposed
# first -- confirm against encoder.py before changing.
src = torch.from_numpy(mel_spectrogram.reshape(mel_spectrogram.shape[0], 1, -1)).float()
src.size()

(128, 241) 16000


torch.Size([128, 1, 241])

In [12]:
# Seed every RNG in play so that weight initialisation (and anything else
# stochastic further down) is reproducible under Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Input geometry is read straight off the `src` tensor built above.
SEQ_LEN = src.size(0)      # length of axis 0 of `src`
INPUT_SIZE = src.size(2)   # length of axis 2 of `src`; becomes the first GRU's input width

# Model hyper-parameters.
OUTPUT_DIM = 28            # presumably the output vocabulary size -- TODO confirm
ENC_HID_DIM = 256
DEC_HID_DIM = 256
DEC_EMB_DIM = 256
DROPOUT_RATE = 0.2

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assemble encoder -> attention -> decoder and wrap them in the Generator,
# then move the whole model onto the selected device.
encoder = Encoder(SEQ_LEN, INPUT_SIZE, ENC_HID_DIM, DEC_HID_DIM, DROPOUT_RATE)
attention = Attention(enc_hid_dim=ENC_HID_DIM, dec_hid_dim=DEC_HID_DIM)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT_RATE, attention)
model = Generator(encoder, decoder, device).to(device)

In [13]:
# Display the assembled Generator's module tree (encoder and decoder sub-layers)
# as a quick sanity check of the layer sizes derived from the constants above.
model

Generator(
  (encoder): Encoder(
    (bi_gru1): GRU(241, 256, bidirectional=True)
    (bi_gru2): GRU(768, 256, bidirectional=True)
    (bi_gru3): GRU(768, 256, bidirectional=True)
    (bi_gru4): GRU(768, 256, bidirectional=True)
    (bi_gru5): GRU(768, 256, bidirectional=True)
    (bi_gru6): GRU(768, 256, bidirectional=True)
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (pool): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (dropout): Dropout(p=0.2)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
    )
    (embedding): Embedding(28, 256)
    (gru): GRU(768, 256)
    (out): Linear(in_features=1024, out_features=28, bias=True)
    (dropout): Dropout(p=0.2)
  )
)