# Importing Modules

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
import numpy as np
import joblib

In [2]:
print( tf.__version__)

2.16.1


# Generator

In [3]:
def initialize_generator(input_dim,output_dim):
        model = models.Sequential([
        # layers.Dense(256, input_dim=input_dim),
        # layers.ReLU(),
        # layers.Dense(512),
        # layers.ReLU(),
        # layers.Dense(output_dim),

        layers.Dense(512, input_dim=input_dim),
        layers.ReLU(),
        layers.Dense(512),
        layers.ReLU(),
        layers.Dense(512),
        layers.ReLU(),
        layers.Dense(output_dim),
    ])
        return model

# Discriminator 

In [4]:
def initialize_discriminator(input_dim):
    model = models.Sequential([
    # layers.Dense(512, input_dim=input_dim),
    # layers.LeakyReLU(alpha=0.2),
    # layers.Dense(256),
    # layers.LeakyReLU(alpha=0.2),
    # layers.Dense(1, activation='sigmoid'),

    layers.Dense(512, input_dim=input_dim),
    layers.LeakyReLU(alpha=0.2),
    layers.Dense(512),
    layers.LeakyReLU(alpha=0.2),
    layers.Dense(512),
    layers.LeakyReLU(alpha=0.2),
    layers.Dense(1, activation='sigmoid'),
    ])
    return model

# Loading and prepareing Data for training 

In [5]:
def create_features_DataLoader(batch_size=64):
    
    # Initializing the features files path
    nam_mfcc_path=r"C:\Users\SID\Documents\Speech\voice_conversion_gan\data\processed\mfcc_nam.pkl"
    whsp_mfcc_path=r"C:\Users\SID\Documents\Speech\voice_conversion_gan\data\processed\mfcc_headset.pkl"
    
    # loading the features from the pickle file 
    nam_mfcc=joblib.load(nam_mfcc_path)
    whsp_mfcc=joblib.load(whsp_mfcc_path)
    
    # convert the numpy array into Tensor 
    nam_mfcc = tf.convert_to_tensor(nam_mfcc, dtype=tf.float32)
    whsp_mfcc = tf.convert_to_tensor(whsp_mfcc, dtype=tf.float32)
    dataset = tf.data.Dataset.from_tensor_slices((nam_mfcc, whsp_mfcc))
    
    return dataset.batch(batch_size)

# Initial Stage 

In [6]:
# # Hyperparameters
# input_dim = 62100  # As we use 20 number of mfcc features and 3105 time steps  to be extracted for the audio file
# output_dim = 62100  # Same as input_dim

input_dim = 40365 # As we use 13 number of mfcc features and 3105 time steps  to be extracted for the audio file
output_dim = 40365  # Same as input_dim
lr = 0.001
batch_size = 64
epochs = 100

# Instantiate models 

# Genrator for NAM to Whisper 
G_NAM2WHSP=initialize_generator(input_dim=input_dim,output_dim=output_dim)
# Genrator for Whisper to Nam
G_WHSP2NAM=initialize_generator(input_dim=input_dim,output_dim=output_dim)
# Discriminator for NAM
D_NAM=initialize_discriminator(input_dim=input_dim)
# Discriminator for Whsp
D_WHSP=initialize_discriminator(input_dim=input_dim)

# Define loss functions
adversarial_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
huber_loss = tf.keras.losses.Huber()
# mmse_loss = tf.keras.losses.MeanSquaredError()

# Define optimizers
optimizer_G = optimizers.Adam(lr, beta_1=0.5, beta_2=0.999)
optimizer_D_W = optimizers.Adam(lr, beta_1=0.5, beta_2=0.999)
optimizer_D_S = optimizers.Adam(lr, beta_1=0.5, beta_2=0.999)

# Data Loaders
dataloader =create_features_DataLoader(batch_size)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training Genrators and Discriminators

In [7]:
for epoch in range (epochs):
    for i,(nam,whsp) in enumerate(dataloader):        
        # An Array with size 64 ie batch size , with all 1's
        valid = tf.ones((nam.shape[0],1))

        # An Array with size 64 ie batch size , with all 0's
        fake=tf.zeros((nam.shape[0],1))
        # break
        with tf.GradientTape() as tape_G, tf.GradientTape() as tape_D_NAM, tf.GradientTape() as tape_D_WHSP:
            # Generator NAM -> WHSP
            gen_whsp=G_NAM2WHSP(nam)
            validity_whsp=D_WHSP(gen_whsp)
            g_loss_whsp=adversarial_loss(valid,validity_whsp)
            gen_whsp_loss=huber_loss(gen_whsp,whsp)
            # Generator WHSP -> NAM
            gen_nam=G_WHSP2NAM(whsp)
            validity_nam=D_NAM(gen_nam)
            g_loss_nam=adversarial_loss(valid,validity_nam)
            gen_nam_loss=huber_loss(gen_nam,nam)

            # Reconstruction Loss
            
            # gen_whiper to NAM Reconstruction
            recon_nam=G_WHSP2NAM(gen_whsp)
            recon_loss_nam = huber_loss(nam, recon_nam)
           

            # gen_nam to whisper Reconstruction
            recon_whsp=G_NAM2WHSP(gen_nam)
            recon_loss_whsp = huber_loss(whsp, recon_whsp)
            
            # Total Generator Loss
            g_loss= gen_whsp_loss+gen_nam_loss+g_loss_whsp+g_loss_nam+recon_loss_nam+recon_loss_whsp
           

            # Discriminator Loss for NAM 
            real_pred_nam=D_NAM(nam)
            fake_pred_nam=D_NAM(gen_nam)

            d_loss_real_nam=adversarial_loss(valid,real_pred_nam)
            d_loss_fake_nam=adversarial_loss(fake,fake_pred_nam)

            d_loss_nam=(d_loss_fake_nam+d_loss_real_nam)/2

           
            # Discriminator Loss for NAM 
            real_pred_whsp=D_WHSP(whsp)
            fake_pred_whsp=D_WHSP(gen_nam)

            d_loss_real_whsp=adversarial_loss(valid,real_pred_whsp)
            d_loss_fake_whsp=adversarial_loss(fake,fake_pred_whsp)

            d_loss_whsp=(d_loss_fake_whsp+d_loss_real_whsp)/2
            
            # Backpropogation for generators 
            
            grads_G= tape_G.gradient(g_loss,G_NAM2WHSP.trainable_variables +G_WHSP2NAM.trainable_variables)
            optimizer_G.apply_gradients(zip(grads_G, G_NAM2WHSP.trainable_variables +G_WHSP2NAM.trainable_variables))

            # Backpropagation for discriminator NAM
            grads_D_W = tape_D_NAM.gradient(d_loss_nam, D_NAM.trainable_variables)
            optimizer_D_W.apply_gradients(zip(grads_D_W, D_NAM.trainable_variables)) 

            # Backpropagation for discriminator S
            grads_D_S = tape_D_WHSP.gradient(d_loss_whsp, D_WHSP.trainable_variables)
            optimizer_D_S.apply_gradients(zip(grads_D_S, D_WHSP.trainable_variables))
            print(f"[Epoch {epoch}/{epochs}] [Batch {i}/{len(dataloader)}] [d_loss_nam : {d_loss_nam.numpy()}] [d_loss_whsp : {d_loss_whsp.numpy()}] [G loss: {g_loss.numpy()}]")

            

[Epoch 0/100] [Batch 0/7] [d_loss_nam : 0.15789416432380676] [d_loss_whsp : 79.52750396728516] [G loss: 251.2281494140625]
[Epoch 0/100] [Batch 1/7] [d_loss_nam : 0.0] [d_loss_whsp : 0.0] [G loss: 2369.2822265625]
[Epoch 0/100] [Batch 2/7] [d_loss_nam : 17132.806640625] [d_loss_whsp : 14931.09765625] [G loss: 976.5662841796875]
[Epoch 0/100] [Batch 3/7] [d_loss_nam : 747.5006103515625] [d_loss_whsp : 0.0] [G loss: 80353.8671875]
[Epoch 0/100] [Batch 4/7] [d_loss_nam : 4988.57421875] [d_loss_whsp : 1969.8265380859375] [G loss: 381.8114013671875]
[Epoch 0/100] [Batch 5/7] [d_loss_nam : 0.0] [d_loss_whsp : 9156.3310546875] [G loss: 5218.5947265625]
[Epoch 0/100] [Batch 6/7] [d_loss_nam : 7831.05126953125] [d_loss_whsp : 24312.263671875] [G loss: 644.2354736328125]
[Epoch 1/100] [Batch 0/7] [d_loss_nam : 0.0] [d_loss_whsp : 0.0] [G loss: 133430.0]
[Epoch 1/100] [Batch 1/7] [d_loss_nam : 472.80499267578125] [d_loss_whsp : 1029.94921875] [G loss: 6225.861328125]
[Epoch 1/100] [Batch 2/7] [d_

Saving the models

In [8]:
models_path = r'C:\Users\SID\Documents\Speech\voice_conversion_gan\models'
# Genrator for NAM to Whisper 
G_NAM2WHSP.save(models_path+r"\G_NAM2WHSP.keras")
# Genrator for Whisper to Nam
G_WHSP2NAM.save(models_path+r"\G_WHSP2NAM.keras")
# Discriminator for NAM
D_NAM.save(models_path+r"\D_NAM.keras")
# Discriminator for Whsp
D_WHSP.save(models_path+r"\D_WHSP.keras")

In [9]:
G_NAM2WHSP.summary()

In [10]:
G_WHSP2NAM.summary()

In [11]:
D_NAM.summary()

In [12]:
D_WHSP.summary()

In [13]:
# from tensorflow.keras.models import load_model
# load_model(models_path+r"\D_WHSP.keras")

In [None]:
D_NAM