In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
# os.environ['CUDA_LAUNCH_BLOCKING']='1'
import random
import torchaudio
from torchaudio import transforms

from torchsummary import summary
import gc
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


from torchvision import models
from tqdm import tqdm

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

config = {
    'epochs': 150,
    'batch_size' : 8,
    'context' : 48,
    'learning_rate' : 0.001,
    'architecture' : 'very-low-cutoff'
}

# preprocessing

# padding

In [3]:
temp = torch.zeros((2,64,5002))
MAX_test = 10000

In [4]:
temp_test = np.pad(temp, ((0, 0),(0,0), (0, MAX_test - temp.shape[2])), 'symmetric')

In [5]:
temp_test[:,:,:10].shape

(2, 64, 10)

In [6]:
random.randint(0,0)

0

## Dataloader

In [7]:
class AudioDataset(torch.utils.data.Dataset):

    def __init__(self, data_path, am_path, gender = "Female_processed", am_idx = 1, MAX_LEN = 128, partition = "train"):
        """
        :param data_path: the root path of phonemes
        :param am_path: the path of am (.csv)
        :param gender: female or male
        :param phoneme_idx: the phoneme index
        :param am_idx: the index of target AM, should be int within [1, 96]
        :param MAX_LEN: max length of voice seq, if less, pad, if more, slice
        :param partition: train / val1 / val2 / test
        """

        self.MAX_LEN = MAX_LEN
        # get phoneme list
        self.target_voice_path = "/".join([data_path, gender])
        voice_list = sorted(os.listdir(self.target_voice_path))
        random.shuffle(voice_list)
        length = len(voice_list)
        if partition == "train":
            self.voice_list = voice_list[:int(0.7 * length)]
        elif partition == "val1":
            self.voice_list = voice_list[int(0.7 * length):int(0.8 * length)]
        elif partition == "val2":
            self.voice_list = voice_list[int(0.8 * length):int(0.9 * length)]
        elif partition == "test":
            self.voice_list = voice_list[int(0.9 * length):]
            
        # if partition == "train":
        #     self.phoneme_list = phoneme_list[:int(0.7 * length)]
        # elif partition == "val1":
        #     self.phoneme_list = phoneme_list[int(0.7 * length):]


        self.length = len(self.voice_list)

        # get_am data
        am_data = pd.read_csv(am_path)
        self.am_data = am_data[["ID", str(am_idx)]]

    def __len__(self):
        return self.length

    def spectro_gram(self, sig, rate_of_sample=44100, n_mels=64, n_fft=512, hop_len=None):
        # top_db = 80

        # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
        spec = transforms.MelSpectrogram(
                sample_rate=rate_of_sample, n_fft=n_fft,
                win_length=400, hop_length=160, n_mels=n_mels)(sig)
        
        # Convert to decibels
        # spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec

    def __getitem__(self, ind):
        item_filename = self.voice_list[ind]
        item_full_path = "/".join([self.target_voice_path, item_filename])
        
        data_waveform, rate_of_sample = torchaudio.load(item_full_path)
        # voice = np.load(item_full_path)

        person_id = int(item_filename[1:7])
        try:
            target_am = self.am_data[self.am_data["ID"] == person_id].values[0][-1]
        except:
            print("person id =", person_id)
            target_am = 0.

        # padding
        data_waveform = torch.tensor(data_waveform, dtype=torch.float) #.reshape(1, -1)
        # apply mel transform
        data_waveform = self.spectro_gram(data_waveform, rate_of_sample)

        std, mean = torch.std_mean(data_waveform, unbiased=False, dim=0)
        data_waveform = (data_waveform - mean) / (std + 1e-6)
        # print(data_waveform.shape)
        if data_waveform.shape[2] < MAX_LEN:
            # data_waveform = np.pad(data_waveform, ((0, 0),(0,0), (0, MAX_LEN - data_waveform.shape[2])), 'symmetric'), 'constant', constant_values=(0, 0)
            data_waveform = np.pad(data_waveform, ((0, 0),(0,0), (0, MAX_LEN - data_waveform.shape[2])), 'constant', constant_values=(0, 0))
            
            data_waveform = torch.from_numpy(data_waveform)
        else:
            temp_start = random.randint(0, data_waveform.shape[2] - MAX_LEN)
            data_waveform = data_waveform[:,:,temp_start:temp_start + MAX_LEN]
        # print(data_waveform.shape)
        # phoneme = torch.from_numpy(phoneme)
        ##################################################################
        # data_waveform.unsqueeze_(0)
        ##################################################################
        target_am = torch.tensor(target_am).to(torch.float32)
        
        return data_waveform, target_am


In [8]:
# class AudioDataset(torch.utils.data.Dataset):

#     def __init__(self, data_path, am_path, gender = "female", phoneme_idx = 4, am_idx = 1, MAX_LEN = 44100 * 2, partition = "train"):
#         """
#         :param data_path: the root path of phonemes
#         :param am_path: the path of am (.csv)
#         :param gender: female or male
#         :param phoneme_idx: the phoneme index
#         :param am_idx: the index of target AM, should be int within [1, 96]
#         :param MAX_LEN: max length of voice seq, if less, pad, if more, slice
#         :param partition: train / val1 / val2 / test
#         """

#         self.MAX_LEN = MAX_LEN
#         # get phoneme list
#         self.target_phoneme_path = "/".join([data_path, gender, str(int(phoneme_idx))])
#         phoneme_list = sorted(os.listdir(self.target_phoneme_path))
#         length = len(phoneme_list)
#         if partition == "train":
#             self.phoneme_list = phoneme_list[:int(0.7 * length)]
#         elif partition == "val1":
#             self.phoneme_list = phoneme_list[int(0.7 * length):int(0.8 * length)]
#         elif partition == "val2":
#             self.phoneme_list = phoneme_list[int(0.8 * length):int(0.9 * length)]
#         elif partition == "test":
#             self.phoneme_list = phoneme_list[int(0.9 * length):]

#         self.length = len(self.phoneme_list)

#         # get_am data
#         am_data = pd.read_csv(am_path)
#         self.am_data = am_data[["ID", str(am_idx)]]

#     def __len__(self):
#         return self.length

#     def spectro_gram(self, sig, n_mels=64, n_fft=1024, hop_len=None):
#         top_db = 80

#         # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
#         spec = transforms.MelSpectrogram(44100, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

#         # Convert to decibels
#         spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
#         return spec

#     def padding(self, phoneme):
#         if len(phoneme) < self.MAX_LEN:
#             pad_begin_len = random.randint(0, self.MAX_LEN - len(phoneme))
#             pad_end_len = self.MAX_LEN - len(phoneme) - pad_begin_len

#             # Pad with 0s
#             pad_begin = np.zeros(pad_begin_len)
#             pad_end = np.zeros(pad_end_len)

#             phoneme = np.concatenate((pad_begin, phoneme, pad_end), 0)
#         else:
#             phoneme = phoneme[:self.MAX_LEN]
#         return phoneme

#     def __getitem__(self, ind):
#         item_filename = self.phoneme_list[ind]
#         item_full_path = "/".join([self.target_phoneme_path, item_filename])
#         phoneme = np.load(item_full_path)

#         person_id = int(item_filename.split("_")[0][1:7])
#         try:
#             target_am = self.am_data[self.am_data["ID"] == person_id].values[0][-1]
#         except:
#             print("person id =", person_id)
#             target_am = 0.

#         # padding
#         phoneme = self.padding(phoneme)
#         phoneme = torch.tensor(phoneme, dtype=torch.float) #.reshape(1, -1)
#         # apply mel transform
#         phoneme = self.spectro_gram(phoneme)
        
#         ################################### Normalization ######################################
#         std, mean = torch.std_mean(phoneme, unbiased=False, dim=0)
#         phoneme = (phoneme - mean) / (std + 1e-6)
#         # print(phoneme)
#         # ####################### convert phoneme from float32 to float64 ##################
#         # phoneme = phoneme.to(torch.float64)
#         # ##################################################################################

#         target_am = torch.tensor(target_am)
        
        
#         ####################################################################################
#         target_am = target_am.to(torch.float32)
#         # print(target_am)
#         ####################################################################################
        
#         # jia yi ge gui yi hua (phoneme)
        
#         return phoneme, target_am

In [9]:
# default_root_path = "./penstate_data/extract_phoneme"
default_root_path = "./penstate_data/download/Full_voice_files"

# am_path = "./penstate_data/AMs_unnormalized.csv"
am_path = "./penstate_data/AMs_final.csv"

############## Female ##################
gender = "Female_processed" # Male_processed
am_idx = 89

# gender = "female"
# phoneme_idx = 10
# am_idx = 13

# gender = "female"
# phoneme_idx = 10
# am_idx = 42

# gender = "female"
# phoneme_idx = 10
# am_idx = 7

############## Male ##################
# gender = "male"
# phoneme_idx = 10
# am_idx = 89

# gender = "male"
# phoneme_idx = 10
# am_idx = 51

# gender = "male"
# phoneme_idx = 10
# am_idx = 4

# gender = "male"
# phoneme_idx = 10
# am_idx = 64


In [10]:
MAX_LEN = 4096 # TODO: may be too small
batch_size = config['batch_size']
# batch_size = 4
train_data = AudioDataset(data_path=default_root_path,
                            am_path = am_path,
                            gender = gender, am_idx = am_idx, MAX_LEN = MAX_LEN, partition="train")

######################################################################################################################################
val_data = AudioDataset(data_path=default_root_path,
                            am_path = am_path,
                            gender = gender, am_idx = am_idx, MAX_LEN = MAX_LEN, partition="val1")
test_data = AudioDataset(data_path=default_root_path,
                            am_path = am_path,
                            gender = gender, am_idx = am_idx, MAX_LEN = MAX_LEN, partition="val1")
######################################################################################################################################

train_loader = torch.utils.data.DataLoader(train_data, num_workers=0,
                                               batch_size=batch_size, shuffle=True)

######################################################################################################################################
val_loader = torch.utils.data.DataLoader(val_data, num_workers=0,
                                               batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_data, num_workers=0,
                                               batch_size=batch_size)
######################################################################################################################################

print("Batch size: ", config['batch_size'])

print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size:  8
Train dataset samples = 482, batches = 61
Validation dataset samples = 70, batches = 9
Test dataset samples = 70, batches = 9


In [11]:
print("Batch size: ", batch_size)
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
for i, data in enumerate(train_loader):
    voice, target_am = data
    print("voice shape =",voice.shape, "target_am shape =",target_am.shape)
    break
# for i, data in enumerate(train_loader):
#     phoneme, target_am = data
#     print(phoneme.shape, target_am.shape)
#     ##########################################
#     # print(phoneme.dtype, target_am.dtype)
#     ##########################################
#     # break

Batch size:  8
Train dataset samples = 482, batches = 61


  data_waveform = torch.tensor(data_waveform, dtype=torch.float) #.reshape(1, -1)


voice shape = torch.Size([8, 2, 64, 4096]) target_am shape = torch.Size([8])


## Model

## Model 8: MnasNet

##### MAE=0.68 ok???

In [12]:
# model = models.mnasnet1_0(weights=None).to(device)
# # print(model)
# model.layers[0] = nn.Conv2d(2, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
# model.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
# # print(model)

In [13]:
# model = model.to(device)
# phoneme, AM = next(iter(train_loader))
# # # summary(model,(64, 259)) # After conv: torch.Size([2, 128, 5, 18])
# summary(model, phoneme.to(device))

  data_waveform = torch.tensor(data_waveform, dtype=torch.float) #.reshape(1, -1)


Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1280, 2, 128]        --
|    └─Conv2d: 2-1                       [-1, 32, 32, 2048]        576
|    └─BatchNorm2d: 2-2                  [-1, 32, 32, 2048]        64
|    └─ReLU: 2-3                         [-1, 32, 32, 2048]        --
|    └─Conv2d: 2-4                       [-1, 32, 32, 2048]        288
|    └─BatchNorm2d: 2-5                  [-1, 32, 32, 2048]        64
|    └─ReLU: 2-6                         [-1, 32, 32, 2048]        --
|    └─Conv2d: 2-7                       [-1, 16, 32, 2048]        512
|    └─BatchNorm2d: 2-8                  [-1, 16, 32, 2048]        32
|    └─Sequential: 2-9                   [-1, 24, 16, 1024]        --
|    |    └─_InvertedResidual: 3-1       [-1, 24, 16, 1024]        2,592
|    |    └─_InvertedResidual: 3-2       [-1, 24, 16, 1024]        4,440
|    |    └─_InvertedResidual: 3-3       [-1, 24, 16, 1024]        4,440
|  

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1280, 2, 128]        --
|    └─Conv2d: 2-1                       [-1, 32, 32, 2048]        576
|    └─BatchNorm2d: 2-2                  [-1, 32, 32, 2048]        64
|    └─ReLU: 2-3                         [-1, 32, 32, 2048]        --
|    └─Conv2d: 2-4                       [-1, 32, 32, 2048]        288
|    └─BatchNorm2d: 2-5                  [-1, 32, 32, 2048]        64
|    └─ReLU: 2-6                         [-1, 32, 32, 2048]        --
|    └─Conv2d: 2-7                       [-1, 16, 32, 2048]        512
|    └─BatchNorm2d: 2-8                  [-1, 16, 32, 2048]        32
|    └─Sequential: 2-9                   [-1, 24, 16, 1024]        --
|    |    └─_InvertedResidual: 3-1       [-1, 24, 16, 1024]        2,592
|    |    └─_InvertedResidual: 3-2       [-1, 24, 16, 1024]        4,440
|    |    └─_InvertedResidual: 3-3       [-1, 24, 16, 1024]        4,440
|  

## Model 9: Wide ResNet

In [14]:
# model = models.mnasnet1_0(weights=None).to(device)
# # print(model)
# model.layers[0] = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
# model.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
# # print(model)


In [15]:
# model = model.to(device)
# phoneme, AM = next(iter(train_loader))
# # # summary(model,(64, 259)) # After conv: torch.Size([2, 128, 5, 18])
# # summary(model, phoneme.to(device))

## Model 10: VGG

In [16]:
model = models.vgg16(weights=None).to(device)
# print(model)
model.features[0] = nn.Conv2d(2, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
model.classifier.append(nn.ReLU(inplace=True))
model.classifier.append(nn.Dropout(p=0.5, inplace=False))
model.classifier.append(nn.Linear(in_features=1000, out_features=1, bias=True))

print(model)


In [None]:
model = model.to(device)
# phoneme, AM = next(iter(train_loader))
# # # summary(model,(64, 259)) # After conv: torch.Size([2, 128, 5, 18])
# summary(model, phoneme.to(device))

# Train and eval

In [17]:
torch.cuda.empty_cache()
gc.collect()

136

In [18]:
criterion = torch.nn.MSELoss() #Defining Loss function 
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate']) #Defining Optimizer
# optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=0.0001, last_epoch=-1)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[35,40,45,50,60,65,70,90,110,150,170,180], gamma=0.5) # add learning rate scheduler

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * config['epochs']))

In [19]:
def train(model, optimizer, criterion, dataloader):

    model.train()
    train_loss = 0.0 #Monitoring Loss
    
    #########################################################
    # AM_true_list = []
    # AM_pred_list = []
    #########################################################
    
    for iter, (phoneme, AM) in enumerate(dataloader):
        scheduler.step()
        ### Move Data to Device (Ideally GPU)
        phoneme = phoneme.to(device)
        AM = AM.to(device)

        ### Forward Propagation
        preds_AM = model(phoneme)

        ### Loss Calculation
        # print(AM.shape)
        preds_AM = torch.squeeze(preds_AM)
        # print(preds_AM)
        # print(preds_AM.shape)model = models.shufflenet_v2_x1_0(weights=None).to(device)
        loss = criterion(preds_AM, AM)
        train_loss += loss.item()
        
        #########################################################
        ### Store Pred and True Labels
        # AM_pred_list.extend(preds_AM.cpu().tolist())
        # AM_true_list.extend(AM.cpu().tolist())
        #########################################################

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Backward Propagation
        loss.backward()

        ### Gradient Descent
        optimizer.step()
        # if iter % 20 == 0:
        #     print("iter =", iter, "loss =",loss.item())
    train_loss /= len(dataloader)
    print("Learning rate = ", scheduler.get_last_lr()[0])
    print("Train loss = ", train_loss)
    
    #########################################################
    # print(AM_pred_list)
    # print(AM_true_list)
    # print(len(AM_pred_list))
    # print(len(AM_true_list))
    # accuracy = mean_squared_error(AM_pred_list, AM_true_list)
    # print("Train MSE accuracy: ", accuracy)
    #########################################################
    
    # scheduler.step() # add schedule learning rate
    return train_loss

In [20]:
def eval(model, dataloader):

    model.eval() # set model in evaluation mode

    AM_true_list = []
    AM_pred_list = []

    for i, data in enumerate(dataloader):

        phoneme, AM = data
        ### Move data to device (ideally GPU)
        phoneme, AM = phoneme.to(device), AM.to(device) 

        with torch.inference_mode(): # makes sure that there are no gradients computed as we are not training the model now
            ### Forward Propagation
            ### Get Predictions
            predicted_AM = model(phoneme)
            # print(predicted_AM)
        
        ### Store Pred and True Labels
        AM_pred_list.extend(predicted_AM.cpu().tolist())
        AM_true_list.extend(AM.cpu().tolist())
        
        # Do you think we need loss.backward() and optimizer.step() here?
    
        del phoneme, AM, predicted_AM
        torch.cuda.empty_cache()

    ###############################################################################################
    # print(AM_pred_list[1000:3100])
    # print(AM_true_list)
    # print(len(AM_pred_list))
    # print(len(AM_true_list))
    ###############################################################################################
    
    # print("Number of equals between two list: ", sum(a == b for a,b in zip(AM_pred_list, AM_true_list)))
    
    ### Calculate Accuracy
    MSE = mean_squared_error(AM_pred_list, AM_true_list)
    r2_score_acc = r2_score(AM_pred_list, AM_true_list)
    MAE = mean_absolute_error(AM_pred_list, AM_true_list)
    print("Validation r2_score: ", r2_score_acc)
    print("Validation MAE: ", MAE)
    
    return MSE

# Experiment

In [None]:
# Iterate over number of epochs to train and evaluate your model
torch.cuda.empty_cache()

best_mse = 1.0 ### Monitor best accuracy in your run

for epoch in range(config['epochs']):
    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    train_loss = train(model, optimizer, criterion, train_loader)
    MSE = eval(model, val_loader)

    print("\tTrain Loss: ", train_loss)
    print("\tValidation MSE: ", MSE)

    ### Save checkpoint if accuracy is better than your current best
    if MSE < best_mse:
        best_mse = MSE
    ### Save checkpoint with information you want
        torch.save({'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'loss': train_loss,
              'learning rate': scheduler.get_last_lr()[0],
              'mse': MSE}, 
        './model_checkpoint.pth')


Epoch 1/150


  data_waveform = torch.tensor(data_waveform, dtype=torch.float) #.reshape(1, -1)


# Test

In [None]:
def test(model, test_loader):
  ### What you call for model to perform inference?
    model.eval()

  ### List to store predicted phonemes of test data
    test_predictions = []
    ground_truth = []

  ### Which mode do you need to avoid gradients?
    with torch.inference_mode():

        for i, data in enumerate(tqdm(test_loader)):

            phoneme, groundtruth_AM = data
            ### Move data to device (ideally GPU)
            phoneme, groundtruth_AM = phoneme.to(device), groundtruth_AM.to(device)         
          
            predicted_AM = model(phoneme)
            predicted_AM.squeeze_()
            # print(predicted_AM.shape)
            # print(groundtruth_AM.shape)

          ### How do you store predicted_phonemes with test_predictions? Hint, look at eval 
            test_predictions.extend(predicted_AM.cpu().tolist())
            ground_truth.extend(groundtruth_AM.cpu())
    
    # print(len(test_predictions))
    return test_predictions, ground_truth

In [None]:
predictions, ground_truth = test(model, test_loader)

In [None]:
### Create CSV file with predictions
if gender == "female":
    g_flag = "F"
else:
    g_flag = "M"
    
with open("./%s_"%g_flag + "phoneme%s"%phoneme_idx +  "_AM%s.csv"%am_idx, "w+") as f:
    f.write("person, label, prediction\n")
    for i in range(len(predictions)):
        f.write("{},{},{}\n".format(i, ground_truth[i], predictions[i]))

## 