In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.autograd import Variable
import librosa

In [2]:
# Load the cv-valid-train metadata CSV and keep only rows that have at least
# five non-missing fields (that is what `thresh=5` means for dropna).
df = pd.read_csv("cv_corpus_v1/cv_corpus_v1/cv-valid-train.csv").dropna(thresh=5)
df.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
5,cv-valid-train/sample-000005.mp3,a shepherd may like to travel but he should ne...,1,0,twenties,female,us,
8,cv-valid-train/sample-000008.mp3,put jackie right on the staff,3,0,seventies,male,us,
13,cv-valid-train/sample-000013.mp3,but he had found a guide and didn't want to mi...,1,0,thirties,female,us,
14,cv-valid-train/sample-000014.mp3,as they began to decorate the hallway a silhou...,1,0,sixties,male,england,
19,cv-valid-train/sample-000019.mp3,then they got ahold of some dough and went goofy,1,0,fifties,male,australia,


In [47]:
# Paths of the first 5000 clips. `.values` is used instead of
# `Series.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
ar = df["filename"].values
ar = ar[:5000]

# librosa.load returns (samples, sample_rate); only the samples are needed here.
ms, _ = librosa.load("cv_corpus_v1/cv_corpus_v1/" + ar[0])

# Force the clip to exactly 25000 samples: truncate longer clips and zero-pad
# shorter ones. Without the padding a short clip would produce fewer than the
# 63 spectrogram frames that PreNet's first Linear layer is hard-wired to.
ms = ms[:25000]
if len(ms) < 25000:
    ms = np.pad(ms, (0, 25000 - len(ms)), mode="constant")

# 80 mel bands; with hop_length=400 a 25000-sample clip gives 63 frames.
k = librosa.feature.melspectrogram(y=ms, hop_length=400, n_fft=1600, n_mels=80)
k.shape

(80, 63)

In [105]:
class PreNet(nn.Module):
    """Fully connected stack 63 -> 128 -> 128 -> 11 with an ELU after every layer.

    ``nn.Linear`` acts on the last axis of its input, so the input's last
    dimension must be 63; the output keeps the leading dimensions and has a
    last dimension of 11.
    """

    def __init__(self):
        super(PreNet, self).__init__()
        widths = (63, 128, 128, 11)
        stages = []
        # Build Linear/ELU pairs in order so the sub-module indices (0, 2, 4
        # for the Linear layers) match a hand-written Sequential.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.append(nn.Linear(fan_in, fan_out))
            stages.append(nn.ELU())
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the Linear/ELU stack and return the result."""
        return self.layer(x)


# Module-level instance used by the forward-pass cell further down.
pnet = PreNet()
        

In [164]:
class ConvNet(nn.Module):
    """Three Conv1d(80 -> 80, kernel 12) stages, each followed by a GLU.

    ``nn.GLU()`` uses its default ``dim=-1``, so it halves the *last* axis
    (the length axis of a Conv1d output), not the channel axis. The paddings
    are chosen so that a length-11 input comes back with length 11:

        conv(pad=35): 11 -> 70, GLU -> 35
        conv(pad=11): 35 -> 46, GLU -> 23
        conv(pad=5):  23 -> 22, GLU -> 11

    Other input lengths only work if every intermediate length is even.
    NOTE(review): gating along the length axis is unusual; confirm it is
    intended rather than gating along channels.
    """

    def __init__(self):
        super(ConvNet, self).__init__()
        stages = []
        # One Conv1d/GLU pair per padding value, in execution order.
        for pad in (35, 11, 5):
            stages.append(nn.Conv1d(80, 80, 12, padding=pad))
            stages.append(nn.GLU())
        self.layer1 = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the Conv1d/GLU stack to ``x`` and return the result."""
        return self.layer1(x)


# Module-level instance used by the forward-pass cell further down.
cnet = ConvNet()
        

In [167]:
# Wrap the mel spectrogram `k` as a float tensor and reshape it to (1, 1, 80, 63).
z = Variable(torch.from_numpy(k).float()).view(1, 1, 80, 63)

# PreNet maps the last axis 63 -> 11; the view then drops the extra singleton
# axis so the tensor has the (batch, channels, length) layout Conv1d expects.
o = pnet(z).view(1, 80, 11)

# Convolutional branch plus a residual connection back to its own input.
o2 = cnet(o)
newo = o + o2
newo.size()


torch.Size([1, 80, 11])

In [169]:
class PreEncoderLayer(nn.Module):
    """Single Linear(11 -> 128) projection followed by an ELU.

    The Linear layer acts on the last axis of its input, which must therefore
    have size 11; the output's last axis has size 128.
    """

    def __init__(self):
        super(PreEncoderLayer, self).__init__()
        projection = nn.Linear(11, 128)
        self.layer = nn.Sequential(projection, nn.ELU())

    def forward(self, x):
        """Project ``x`` and apply the ELU; returns the activated projection."""
        return self.layer(x)


# Module-level instance used by the next cell.
PEnLnet = PreEncoderLayer()

In [180]:
# Project the residual output to 128 features per position.
keys = PEnLnet(newo)
# All three names are bound to the very same tensor object.
values = keys
queries = keys
keys.size()



torch.Size([1, 80, 128])

In [184]:
import torch.nn.functional as F
from torch.nn.parameter import Parameter




class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with residual and batch norm.

    Queries are projected from ``query``; keys and values are both projected
    from ``keys``. The projections are split into ``h`` heads along the
    feature axis, attention is computed per head, the heads are concatenated
    back, ``query`` is added as a residual and the sum is batch-normalised
    over the feature axis.

    Args:
        query_dim: Size of the last axis of ``query``.
        key_dim: Size of the last axis of ``keys``; must equal ``query_dim``.
        num_units: Width of the Q/K/V projections; must equal ``query_dim``
            (needed for the residual) and be divisible by ``h``.
        dropout_p: Dropout probability applied to the attention weights
            while the module is in training mode.
        h: Number of attention heads.
        is_masked: If True, each position may only attend to itself and to
            earlier positions (causal mask).

    Raises:
        ValueError: If the dimension constraints above are violated.
    """

    # Large negative score written into masked positions before the softmax.
    _MASK_VALUE = float(-2 ** 32 + 1)

    def __init__(self,
                 query_dim,
                 key_dim,
                 num_units,
                 dropout_p=0.5,
                 h=2,
                 is_masked=False):
        super(MultiHeadAttention, self).__init__()

        if query_dim != key_dim:
            raise ValueError("query_dim and key_dim must be the same")
        if num_units % h != 0:
            raise ValueError("num_units must be dividable by h")
        if query_dim != num_units:
            raise ValueError("to employ residual connection, the number of "
                             "query_dim and num_units must be the same")

        self._num_units = num_units
        self._h = h
        self._key_dim = key_dim
        # Kept as a plain Python float so it follows the inputs to whatever
        # device/dtype they live on (a free-standing tensor would stay on CPU).
        # NOTE(review): this scales by sqrt(key_dim), the full feature width,
        # not by the per-head width num_units / h; kept as in the original.
        self._scale = float(key_dim) ** 0.5
        self._dropout_p = dropout_p
        self._is_masked = is_masked

        self.query_layer = nn.Linear(query_dim, num_units, bias=False)
        self.key_layer = nn.Linear(key_dim, num_units, bias=False)
        self.value_layer = nn.Linear(key_dim, num_units, bias=False)
        self.bn = nn.BatchNorm1d(num_units)

    def forward(self, query, keys):
        """Attend from ``query`` over ``keys``.

        Args:
            query: Tensor of shape (batch, query_len, query_dim).
            keys: Tensor of shape (batch, key_len, key_dim); also the source
                of the values.

        Returns:
            Tensor of shape (batch, query_len, num_units).
        """
        Q = self.query_layer(query)
        K = self.key_layer(keys)
        V = self.value_layer(keys)

        # Split Q, K and V into h heads along the feature axis (dim 2) and
        # stack the heads along the batch axis (dim 0).
        chunk_size = self._num_units // self._h
        Q = torch.cat(Q.split(chunk_size, dim=2), dim=0)
        K = torch.cat(K.split(chunk_size, dim=2), dim=0)
        V = torch.cat(V.split(chunk_size, dim=2), dim=0)

        # Scaled dot-product scores: (h * batch, query_len, key_len).
        attention = torch.matmul(Q, K.transpose(1, 2))
        attention = attention / self._scale

        # Causal masking (usually for a decoder) prevents leftward information
        # flow. The lower-triangular mask is built from the score matrix's
        # shape only, so a legitimately zero score is never mistaken for a
        # masked position, and ones_like keeps it on the scores' device/dtype.
        if self._is_masked:
            allowed = torch.ones_like(attention[0]).tril()
            allowed = allowed.unsqueeze(0).expand_as(attention)
            attention = attention * allowed + (1 - allowed) * self._MASK_VALUE

        attention = F.softmax(attention, dim=-1)
        # Tie dropout to the module's mode: F.dropout defaults to
        # training=True and would otherwise stay active after .eval().
        attention = F.dropout(attention, p=self._dropout_p,
                              training=self.training)
        # Weighted sum of the values.
        attention = torch.matmul(attention, V)

        # Undo the head stacking: split along the batch axis and concatenate
        # the heads back along the feature axis.
        restore_chunk_size = attention.size(0) // self._h
        attention = torch.cat(
            attention.split(restore_chunk_size, dim=0), dim=2)

        # Residual connection.
        attention = attention + query
        # BatchNorm1d normalises dim 1, so move the features there and back.
        attention = self.bn(attention.transpose(1, 2)).transpose(1, 2)

        return attention


In [187]:
# 128-wide attention block with the default head count, dropout and masking.
attention = MultiHeadAttention(query_dim=128, key_dim=128, num_units=128)