### Importing Dependencies

In [1]:
%%bash
pip install numpy scipy librosa unidecode inflect librosa
apt-get update
apt-get install -y libsndfile1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu

In [2]:
import torch
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import math
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
from torchsummary import summary
from google.colab import drive
import torchvision.models as models

In [3]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Unzipping Dataset from gdrive

In [4]:
!unzip gdrive/My\ Drive/DL_project/dataset/lec_01_audio_segmented.zip

Archive:  gdrive/My Drive/DL_project/dataset/lec_01_audio_segmented.zip
replace lec_01_audio_segmented/seg_1.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [5]:
!unzip gdrive/My\ Drive/DL_project/dataset/lec_01_segmented_frames.zip

Archive:  gdrive/My Drive/DL_project/dataset/lec_01_segmented_frames.zip
replace lec_01_segmented_frames/seg_1/frame_0.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


##### Device Configuration

In [6]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


### Defining Different Models

### Load pretrained Tacotron2 Model

In [7]:
tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')
tacotron2 = tacotron2.to(device)
tacotron2.eval()

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
  "pytorch_quantization module not found, quantization will not be available"
  "pytorch_quantization module not found, quantization will not be available"


Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (lin

In [8]:
params = tacotron2.state_dict()
params.keys()

odict_keys(['embedding.weight', 'encoder.convolutions.0.0.conv.weight', 'encoder.convolutions.0.0.conv.bias', 'encoder.convolutions.0.1.weight', 'encoder.convolutions.0.1.bias', 'encoder.convolutions.0.1.running_mean', 'encoder.convolutions.0.1.running_var', 'encoder.convolutions.0.1.num_batches_tracked', 'encoder.convolutions.1.0.conv.weight', 'encoder.convolutions.1.0.conv.bias', 'encoder.convolutions.1.1.weight', 'encoder.convolutions.1.1.bias', 'encoder.convolutions.1.1.running_mean', 'encoder.convolutions.1.1.running_var', 'encoder.convolutions.1.1.num_batches_tracked', 'encoder.convolutions.2.0.conv.weight', 'encoder.convolutions.2.0.conv.bias', 'encoder.convolutions.2.1.weight', 'encoder.convolutions.2.1.bias', 'encoder.convolutions.2.1.running_mean', 'encoder.convolutions.2.1.running_var', 'encoder.convolutions.2.1.num_batches_tracked', 'encoder.lstm.weight_ih_l0', 'encoder.lstm.weight_hh_l0', 'encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0', 'encoder.lstm.weight_ih_l0_reve

In [54]:
tacotron2.embedding = nn.Conv1d(148, 512, kernel_size=(3,), stride=(1,), padding=(2,))

In [55]:
for name, param in tacotron2.named_parameters():
    if param.requires_grad and 'encoder' in name:
        param.requires_grad = True
    else:
      param.requires_grad = False
tacotron2.embedding.weight.requires_grad = True 

In [56]:
for name, param in tacotron2.named_parameters():
    if param.requires_grad:
      print(f"name : {name}, parameter: {param.shape}")

name : embedding.weight, parameter: torch.Size([512, 148, 3])
name : encoder.convolutions.0.0.conv.weight, parameter: torch.Size([512, 512, 5])
name : encoder.convolutions.0.0.conv.bias, parameter: torch.Size([512])
name : encoder.convolutions.0.1.weight, parameter: torch.Size([512])
name : encoder.convolutions.0.1.bias, parameter: torch.Size([512])
name : encoder.convolutions.1.0.conv.weight, parameter: torch.Size([512, 512, 5])
name : encoder.convolutions.1.0.conv.bias, parameter: torch.Size([512])
name : encoder.convolutions.1.1.weight, parameter: torch.Size([512])
name : encoder.convolutions.1.1.bias, parameter: torch.Size([512])
name : encoder.convolutions.2.0.conv.weight, parameter: torch.Size([512, 512, 5])
name : encoder.convolutions.2.0.conv.bias, parameter: torch.Size([512])
name : encoder.convolutions.2.1.weight, parameter: torch.Size([512])
name : encoder.convolutions.2.1.bias, parameter: torch.Size([512])
name : encoder.lstm.weight_ih_l0, parameter: torch.Size([1024, 512])

In [57]:
tacotron2 = tacotron2.to(device)

### Load pretrained WaveGlow model

In [12]:
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
waveglow.eval()

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


WaveGlow(
  (upsample): ConvTranspose1d(80, 80, kernel_size=(1024,), stride=(256,))
  (WN): ModuleList(
    (0): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(51

In [13]:
for name, param in waveglow.named_parameters():
    if param.requires_grad:
        param.requires_grad = False

### Load Pretrained r3d_18 (3D-ResNet) Model

In [30]:
r3d_18 = models.video.r3d_18(pretrained=False)
encoder_model = r3d_18.to(device)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


In [31]:
encoder_model.eval()

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [None]:
#@title Model modifications
encoder_model.fc = nn.Linear(in_features=512, out_features=148, bias=True)

In [32]:
for name, param in encoder_model.named_parameters():
    if param.requires_grad:
      print(f"name : {name}, parameter: {param.shape}")

name : stem.0.weight, parameter: torch.Size([64, 3, 3, 7, 7])
name : stem.1.weight, parameter: torch.Size([64])
name : stem.1.bias, parameter: torch.Size([64])
name : layer1.0.conv1.0.weight, parameter: torch.Size([64, 64, 3, 3, 3])
name : layer1.0.conv1.1.weight, parameter: torch.Size([64])
name : layer1.0.conv1.1.bias, parameter: torch.Size([64])
name : layer1.0.conv2.0.weight, parameter: torch.Size([64, 64, 3, 3, 3])
name : layer1.0.conv2.1.weight, parameter: torch.Size([64])
name : layer1.0.conv2.1.bias, parameter: torch.Size([64])
name : layer1.1.conv1.0.weight, parameter: torch.Size([64, 64, 3, 3, 3])
name : layer1.1.conv1.1.weight, parameter: torch.Size([64])
name : layer1.1.conv1.1.bias, parameter: torch.Size([64])
name : layer1.1.conv2.0.weight, parameter: torch.Size([64, 64, 3, 3, 3])
name : layer1.1.conv2.1.weight, parameter: torch.Size([64])
name : layer1.1.conv2.1.bias, parameter: torch.Size([64])
name : layer2.0.conv1.0.weight, parameter: torch.Size([128, 64, 3, 3, 3])
na

### Defining different Encoder Architecture

In [14]:
def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
    "3x3 convolution with padding"
    return nn.Conv2d(in_planes, out_planes, kernel_size=3,
                     stride=strd, padding=padding, bias=bias)


class ConvBlock(nn.Module):
    def __init__(self, in_planes, out_planes):
        super(ConvBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = conv3x3(in_planes, int(out_planes / 2))
        self.bn2 = nn.BatchNorm2d(int(out_planes / 2))
        self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4))
        self.bn3 = nn.BatchNorm2d(int(out_planes / 4))
        self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4))

        if in_planes != out_planes:
            self.downsample = nn.Sequential(
                nn.BatchNorm2d(in_planes),
                nn.ReLU(True),
                nn.Conv2d(in_planes, out_planes,
                          kernel_size=1, stride=1, bias=False),
            )
        else:
            self.downsample = None

    def forward(self, x):
        residual = x

        out1 = self.bn1(x)
        out1 = F.relu(out1, True)
        out1 = self.conv1(out1)

        out2 = self.bn2(out1)
        out2 = F.relu(out2, True)
        out2 = self.conv2(out2)

        out3 = self.bn3(out2)
        out3 = F.relu(out3, True)
        out3 = self.conv3(out3)

        out3 = torch.cat((out1, out2, out3), 1)

        if self.downsample is not None:
            residual = self.downsample(residual)

        out3 += residual

        return out3


class Bottleneck(nn.Module):

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class HourGlass(nn.Module):
    def __init__(self, num_modules, depth, num_features):
        super(HourGlass, self).__init__()
        self.num_modules = num_modules
        self.depth = depth
        self.features = num_features

        self._generate_network(self.depth)

    def _generate_network(self, level):
        self.add_module('b1_' + str(level), ConvBlock(self.features, self.features))

        self.add_module('b2_' + str(level), ConvBlock(self.features, self.features))

        if level > 1:
            self._generate_network(level - 1)
        else:
            self.add_module('b2_plus_' + str(level), ConvBlock(self.features, self.features))

        self.add_module('b3_' + str(level), ConvBlock(self.features, self.features))

    def _forward(self, level, inp):
        # Upper branch
        up1 = inp
        up1 = self._modules['b1_' + str(level)](up1)

        # Lower branch
        low1 = F.avg_pool2d(inp, 2, stride=2)
        low1 = self._modules['b2_' + str(level)](low1)

        if level > 1:
            low2 = self._forward(level - 1, low1)
        else:
            low2 = low1
            low2 = self._modules['b2_plus_' + str(level)](low2)

        low3 = low2
        low3 = self._modules['b3_' + str(level)](low3)

        up2 = F.interpolate(low3, scale_factor=2, mode='nearest')

        return up1 + up2

    def forward(self, x):
        return self._forward(self.depth, x)


class FAN(nn.Module):

    def __init__(self, num_modules=1):
        super(FAN, self).__init__()
        self.num_modules = num_modules

        # Base part
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = ConvBlock(64, 128)
        self.conv3 = ConvBlock(128, 128)
        self.conv4 = ConvBlock(128, 256)

        # Stacking part
        for hg_module in range(self.num_modules):
            self.add_module('m' + str(hg_module), HourGlass(1, 4, 256))
            self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256))
            self.add_module('conv_last' + str(hg_module),
                            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
            self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256))
            self.add_module('l' + str(hg_module), nn.Conv2d(256,
                                                            68, kernel_size=1, stride=1, padding=0))

            if hg_module < self.num_modules - 1:
                self.add_module(
                    'bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
                self.add_module('al' + str(hg_module), nn.Conv2d(68,
                                                                 256, kernel_size=1, stride=1, padding=0))

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)), True)
        x = F.avg_pool2d(self.conv2(x), 2, stride=2)
        x = self.conv3(x)
        x = self.conv4(x)

        previous = x

        outputs = []
        for i in range(self.num_modules):
            hg = self._modules['m' + str(i)](previous)

            ll = hg
            ll = self._modules['top_m_' + str(i)](ll)

            ll = F.relu(self._modules['bn_end' + str(i)]
                        (self._modules['conv_last' + str(i)](ll)), True)

            # Predict heatmaps
            tmp_out = self._modules['l' + str(i)](ll)
            outputs.append(tmp_out)

            if i < self.num_modules - 1:
                ll = self._modules['bl' + str(i)](ll)
                tmp_out_ = self._modules['al' + str(i)](tmp_out)
                previous = previous + ll + tmp_out_

        return outputs


In [15]:
#@title ResNetDepth
class ResNetDepth(nn.Module):
    def __init__(self, block=Bottleneck, layers=[1, 1, 1, 1], num_classes=68): #36 tha
        self.inplanes = 64
        super(ResNetDepth, self).__init__()
        self.conv1 = nn.Conv2d(3 * 75, 64, kernel_size=3, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [16]:
#@title My Encoder

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(in_channels = 225, out_channels = 6, kernel_size = (5,5))
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(13456, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 142)

    def forward(self, x):
        # -> n, 3, 32, 32
        x = self.pool(F.relu(self.conv1(x)))  # -> n, 6, 14, 14
        x = self.pool(F.relu(self.conv2(x)))  # -> n, 16, 5, 5
        x = x.view(-1, 13456)            # -> n, 400
        x = F.relu(self.fc1(x))               # -> n, 120
        x = F.relu(self.fc2(x))               # -> n, 84
        x = self.fc3(x)                       # -> n, 10
        return x

In [17]:
for name, param in Encoder().named_parameters():
    if param.requires_grad:
      print(f"name : {name}, parameter: {param.shape}")

name : conv1.weight, parameter: torch.Size([6, 225, 5, 5])
name : conv1.bias, parameter: torch.Size([6])
name : conv2.weight, parameter: torch.Size([16, 6, 5, 5])
name : conv2.bias, parameter: torch.Size([16])
name : fc1.weight, parameter: torch.Size([120, 13456])
name : fc1.bias, parameter: torch.Size([120])
name : fc2.weight, parameter: torch.Size([84, 120])
name : fc2.bias, parameter: torch.Size([84])
name : fc3.weight, parameter: torch.Size([142, 84])
name : fc3.bias, parameter: torch.Size([142])


### Image-Sequence data loader

In [18]:
seg_path = r"/content/lec_01_segmented_frames"

In [19]:
lower = 100
upper = 200

In [20]:
#@title Segment dataloading functions

IMG_SIZE = 128

def seg_sorter(seg: str):
    return int(seg[4:])

def load_segment(dir:str, start : int, end: int):
    data_path = dir
    data_arr = []
    path = os.listdir(data_path)
    path.sort(key=seg_sorter)
    for file in path[start:end]:
        seg_arr = []
        print(f"loaded : {file}")
        sub_path = os.path.join(data_path,file)
        for img in os.listdir(sub_path):
            img_path = os.path.join(data_path,file,img)
            a = cv2.imread(img_path)
            a = cv2.cvtColor(a, cv2.COLOR_BGR2RGB)
            a = cv2.resize(a, (IMG_SIZE, IMG_SIZE)) 
            seg_arr.append(a)
        data_arr.append(seg_arr)
    nd_arr = np.array(data_arr)
    return nd_arr


def plot_segment(seg : np.ndarray):
    fig = plt.figure(figsize=(15,13))
    for i in range(75):  
        ax = fig.add_subplot(8, 10, i+1)
        ax.imshow(seg[i,:,:,:])
        ax.set_title('frame:{y}'.format(y=i))
        plt.axis('off')

In [21]:
X = load_segment(seg_path, start = lower, end = upper)

loaded : seg_101
loaded : seg_102
loaded : seg_103
loaded : seg_104
loaded : seg_105
loaded : seg_106
loaded : seg_107
loaded : seg_108
loaded : seg_109
loaded : seg_110
loaded : seg_111
loaded : seg_112
loaded : seg_113
loaded : seg_114
loaded : seg_115
loaded : seg_116
loaded : seg_117
loaded : seg_118
loaded : seg_119
loaded : seg_120
loaded : seg_121
loaded : seg_122
loaded : seg_123
loaded : seg_124
loaded : seg_125
loaded : seg_126
loaded : seg_127
loaded : seg_128
loaded : seg_129
loaded : seg_130
loaded : seg_131
loaded : seg_132
loaded : seg_133
loaded : seg_134
loaded : seg_135
loaded : seg_136
loaded : seg_137
loaded : seg_138
loaded : seg_139
loaded : seg_140
loaded : seg_141
loaded : seg_142
loaded : seg_143
loaded : seg_144
loaded : seg_145
loaded : seg_146
loaded : seg_147
loaded : seg_148
loaded : seg_149
loaded : seg_150
loaded : seg_151
loaded : seg_152
loaded : seg_153
loaded : seg_154
loaded : seg_155
loaded : seg_156
loaded : seg_157
loaded : seg_158
loaded : seg_1

### Audio data loader

In [22]:
audio_path = r"/content/lec_01_audio_segmented"

In [23]:
#@title Audio loading function

def audio_seg_sorter(seg: str):
    splitted = seg.split('.')
    return int(splitted[0][4:])

def load_audio(dir:str, start : int, end: int):
  data_arr = []
  path = os.listdir(dir)
  path.sort(key=audio_seg_sorter)
  #print(path)
  for file in path[start:end]:
      seg_arr = []
      print(file)
      seg_file_temp = os.path.join(dir,file)
      audio_seg_temp = librosa.load(seg_file_temp)
      data_arr.append(audio_seg_temp)
      print(seg_file_temp)
  return data_arr

def audio_to_mel_display(audio : np.ndarray):
  spec = np.abs(librosa.stft(audio, hop_length=512))
  spec = librosa.amplitude_to_db(spec, ref=np.max)
  print(spec.shape)
  librosa.display.specshow(spec, sr=22050, x_axis='time', y_axis='log')
  plt.colorbar(format='%+2.0f dB')
  plt.title('Spectrogram') 

def audio_to_mel_display_grid(audio_arr,number):
  fig = plt.figure(figsize=(15,13))
  for i in range(number):  
      ax = fig.add_subplot(8, 10, i+1)
      spec = np.abs(librosa.stft(audio_arr[i][0], hop_length=512))
      spec = librosa.amplitude_to_db(spec, ref=np.max)
      librosa.display.specshow(spec, sr=22050, x_axis='time', y_axis='log')
      #ax.colorbar(format='%+2.0f dB')
      ax.set_title('seg:{y}'.format(y=i))
      ax.axis('off')  

def listen_audio(audio,sr):
   ipd.Audio(audio, rate=sr)     

In [24]:
Y = load_audio(audio_path, start = lower, end = upper)

seg_101.wav
/content/lec_01_audio_segmented/seg_101.wav
seg_102.wav
/content/lec_01_audio_segmented/seg_102.wav
seg_103.wav
/content/lec_01_audio_segmented/seg_103.wav
seg_104.wav
/content/lec_01_audio_segmented/seg_104.wav
seg_105.wav
/content/lec_01_audio_segmented/seg_105.wav
seg_106.wav
/content/lec_01_audio_segmented/seg_106.wav
seg_107.wav
/content/lec_01_audio_segmented/seg_107.wav
seg_108.wav
/content/lec_01_audio_segmented/seg_108.wav
seg_109.wav
/content/lec_01_audio_segmented/seg_109.wav
seg_110.wav
/content/lec_01_audio_segmented/seg_110.wav
seg_111.wav
/content/lec_01_audio_segmented/seg_111.wav
seg_112.wav
/content/lec_01_audio_segmented/seg_112.wav
seg_113.wav
/content/lec_01_audio_segmented/seg_113.wav
seg_114.wav
/content/lec_01_audio_segmented/seg_114.wav
seg_115.wav
/content/lec_01_audio_segmented/seg_115.wav
seg_116.wav
/content/lec_01_audio_segmented/seg_116.wav
seg_117.wav
/content/lec_01_audio_segmented/seg_117.wav
seg_118.wav
/content/lec_01_audio_segmented/seg_

### DataShuffling and Processing


In [25]:
#@title test-train split
train_X = X[:960] # 70% 
train_Y = Y[:960] # 70% 

test_X = X[960:] # 30% 
test_Y = Y[960:] # 30% 

In [26]:
train_len = len(train_X)
train = []
for i in range(train_len):
  train.append((train_X[i],train_Y[i]))

In [27]:
test_len = len(test_X)
test = []
for i in range(test_len):
  test.append((test_X[i],test_Y[i]))

In [28]:
trainset = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)
testset = torch.utils.data.DataLoader(test, batch_size=1, shuffle=False)

### All Encoder Models Instantiated

### Hyperparameter, Optimizer and Loss Setting

In [29]:
#@title Hyper-Parameters
num_epochs = 5
batch_size = 4
learning_rate = 0.001

In [34]:
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

### Train Loop

In [58]:
from torch.autograd import forward_ad
n_total_steps = len(trainset)


for epoch in range(num_epochs):
    for i, (seg, aud) in enumerate(trainset):
        mel = np.asarray(aud)
        print(seg.shape,mel[0].shape)
        seg = seg.reshape((1,3,75,128,128))
        seg = seg.float()
        print(seg.shape,seg.dtype)

        seg = seg.to(device)
        aud = aud[0].to(device)

        # Forward pass
        encoded = model(seg)

        print(encoded.shape,encoded.dtype)
        #print(encoded)

        length = encoded.T.shape
        length = torch.tensor(length)
        length = length.to(device)

        mel, _ , _ = tacotron2.infer(encoded.T,length)
      
        print(mel.shape)
        audio = waveglow.infer(mel)
        loss = criterion(audio, aud)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 2 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
print('Finished Training')

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


torch.Size([1, 75, 128, 128, 3]) torch.Size([1, 66150])
torch.Size([1, 3, 75, 128, 128]) torch.float32
torch.Size([1, 148])
tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan]], device='cuda:0', grad_fn=<AddmmBackward0>)


IndexError: ignored

### Testing Loop

In [None]:
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(10)]
    n_class_samples = [0 for i in range(10)]
    for seg, aud in testset:
        seg = seg.to(device)
        aud = aud.to(device)
        outputs = model(seg)
        # max returns (value ,index)
        _, predicted = torch.max(outputs, 1)
        n_samples += mel.size(0)
        n_correct += (predicted == mel).sum().item()

### Save Model to \<path\>

In [None]:
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)