# Imports

In [3]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import os

# Compute device shared by the notebook: the CUDA GPU when PyTorch can use one, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Models

The model design follows Albadawi 2020 and these open-source GitHub repositories:
- https://github.com/liusongxiang/StarGAN-Voice-Conversion/blob/master/model.py
- https://github.com/pritishyuvraj/Voice-Conversion-GAN/blob/master/model.py

Creating the residual block for the encoder. `InstanceNorm2d` normalises each sample with respect to itself, rather than across all the samples in a batch (as is done in batch normalisation).

In [4]:
class ResidualBlock(nn.Module):
    """Residual block: two 4x4 convolutions with instance norm plus a skip connection.

    The main branch is pad -> conv -> InstanceNorm2d -> LeakyReLU(0.2) -> pad -> conv ->
    InstanceNorm2d. Each 4x4 convolution is preceded by an asymmetric zero padding
    of 3 pixels per axis, so the branch keeps the input's height and width and its
    output can be added to the skip path.

    When ``dim_in != dim_out`` the skip path goes through a bias-free 1x1
    convolution that projects the input to ``dim_out`` channels; otherwise the
    input is added unchanged.

    Args:
        dim_in: number of channels of the input tensor.
        dim_out: number of channels of the output tensor.
    """

    def __init__(self, dim_in, dim_out):
        super(ResidualBlock, self).__init__()
        self.main = nn.Sequential(
            # (left, right, top, bottom): 3 extra pixels per axis cancel the
            # 3 pixels an unpadded 4x4 convolution removes.
            nn.ZeroPad2d((1, 2, 1, 2)),
            nn.Conv2d(dim_in, dim_out, kernel_size=4, bias=False),
            nn.InstanceNorm2d(dim_out),
            nn.LeakyReLU(0.2),
            nn.ZeroPad2d((1, 2, 1, 2)),
            nn.Conv2d(dim_out, dim_out, kernel_size=4, bias=False),
            nn.InstanceNorm2d(dim_out))

        # The element-wise sum in forward() needs matching channel counts.
        if dim_in == dim_out:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Conv2d(dim_in, dim_out, kernel_size=1, bias=False)

    def forward(self, x):
        """Return ``shortcut(x) + main(x)``.

        Args:
            x: tensor of shape (N, dim_in, H, W).

        Returns:
            Tensor of shape (N, dim_out, H, W).
        """
        return self.shortcut(x) + self.main(x)

The rationale for the encoder is to get rid of all the non-timbre-related information during its dimensionality reduction to the latent space. The generator then builds up the new voice from the rich latent space.

In [12]:
class Encoder(nn.Module):
    """Convolutional encoder mapping a 128-channel input to a 1280-channel latent.

    The stages, applied in order, are:
      1. a 7x7 convolution (128 -> 128 channels) with batch norm and LeakyReLU;
      2. two stride-2 4x4 convolutions (128 -> 256 -> 512 channels), each with
         batch norm and LeakyReLU;
      3. three residual blocks (512 -> 1024 -> 1156 -> 1280 channels).

    The stages are stored in ``self.l`` as an ``nn.ModuleList`` so that their
    parameters are registered with this module.
    """

    def __init__(self):
        super(Encoder, self).__init__()

        # nn.ModuleList instead of a plain list: a plain list hides the layers
        # from parameters(), state_dict(), .to(device) and train()/eval().
        self.l = nn.ModuleList([
            # Initial linear convolutional mapping
            nn.Sequential(
                nn.Conv2d(128, 128, kernel_size=7, bias=False),
                nn.BatchNorm2d(128),
                nn.LeakyReLU(0.2)),

            # Non-linear mapping convolutional layers
            nn.Sequential(
                nn.Conv2d(128, 256, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(256),
                nn.LeakyReLU(0.2)),
            nn.Sequential(
                nn.Conv2d(256, 512, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(512),
                nn.LeakyReLU(0.2)),

            # Residual blocks with skip connections for bottleneck
            ResidualBlock(512, 1024),
            ResidualBlock(1024, 1156),
            ResidualBlock(1156, 1280),
        ])

    def forward(self, x):
        """Run ``x`` through every stage in order.

        Args:
            x: tensor with 128 channels, i.e. shape (N, 128, H, W).

        Returns:
            The output of the last stage.
        """
        for layer in self.l:
            x = layer(x)
        return x

Following Albadawi 2020, the generator is built as the reverse of the encoder and performs the upsampling.

In [17]:
class Generator(nn.Module):
    """Convolutional generator mapping a 1280-channel latent to a 128-channel output.

    The stages mirror the encoder in reverse order:
      1. three residual blocks (1280 -> 1156 -> 1024 -> 512 channels);
      2. two stride-2 4x4 transposed convolutions (512 -> 256 -> 128 channels),
         each with batch norm and LeakyReLU;
      3. a 7x7 transposed convolution (128 -> 128 channels) with batch norm and
         LeakyReLU.

    The stages are stored in ``self.l`` as an ``nn.ModuleList`` so that their
    parameters are registered with this module.
    """

    def __init__(self):
        super(Generator, self).__init__()

        # nn.ModuleList instead of a plain list: a plain list hides the layers
        # from parameters(), state_dict(), .to(device) and train()/eval().
        self.l = nn.ModuleList([
            # Residual blocks for expanding from latent space
            ResidualBlock(1280, 1156),
            ResidualBlock(1156, 1024),
            ResidualBlock(1024, 512),

            # Non-linear mapping convolutional layers
            nn.Sequential(
                nn.ConvTranspose2d(512, 256, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(256),
                nn.LeakyReLU(0.2)),
            nn.Sequential(
                nn.ConvTranspose2d(256, 128, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(128),
                nn.LeakyReLU(0.2)),

            # Final linear convolutional mapping
            nn.Sequential(
                nn.ConvTranspose2d(128, 128, kernel_size=7, bias=False),
                nn.BatchNorm2d(128),
                nn.LeakyReLU(0.2)),
        ])

    def forward(self, x):
        """Run ``x`` through every stage in order.

        Args:
            x: tensor with 1280 channels, i.e. shape (N, 1280, H, W).

        Returns:
            The output of the last stage.
        """
        for layer in self.l:
            x = layer(x)
        return x

In [16]:
class Discriminator(nn.Module):
    """Convolutional discriminator producing a 1-channel score map.

    Four stride-2 4x4 convolutions (128 -> 256 -> 512 -> 1024 -> 1280 channels),
    each followed by batch norm and LeakyReLU(0.2), feed a final 4x4 convolution
    that reduces the 1280 channels to a single channel.

    The final convolution is not followed by normalisation or an activation, so
    the module returns raw, unbounded scores. (A ``BatchNorm2d(1280)`` cannot be
    applied to the 1-channel output of that convolution.)

    The stages are stored in ``self.l`` as an ``nn.ModuleList`` so that their
    parameters are registered with this module.
    """

    def __init__(self):
        super(Discriminator, self).__init__()

        # nn.ModuleList instead of a plain list: a plain list hides the layers
        # from parameters(), state_dict(), .to(device) and train()/eval().
        self.l = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(128, 256, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(256),
                nn.LeakyReLU(0.2)),
            nn.Sequential(
                nn.Conv2d(256, 512, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(512),
                nn.LeakyReLU(0.2)),
            nn.Sequential(
                nn.Conv2d(512, 1024, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(1024),
                nn.LeakyReLU(0.2)),
            nn.Sequential(
                nn.Conv2d(1024, 1280, kernel_size=4, bias=False, stride=2),
                nn.BatchNorm2d(1280),
                nn.LeakyReLU(0.2)),

            # Output layer: raw scores, one channel.
            nn.Conv2d(1280, 1, kernel_size=4, bias=False),
        ])

    def forward(self, x):
        """Run ``x`` through every stage in order.

        Args:
            x: tensor with 128 channels, i.e. shape (N, 128, H, W).

        Returns:
            Tensor with a single channel holding the raw scores.
        """
        for layer in self.l:
            x = layer(x)
        return x


# Backward-compatible alias for the original (misspelled) class name.
Dicriminator = Discriminator