In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
# os.environ['CUDA_LAUNCH_BLOCKING']='1'
import random
import torchaudio
from torchaudio import transforms

from torchsummary import summary
import gc
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


from torchvision import models
from tqdm import tqdm

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

config = {
    'epochs': 150,
    'batch_size' : 32,
    'context' : 48,
    'learning_rate' : 0.01,
    'architecture' : 'very-low-cutoff'
}

## Dataloader

In [3]:
class AudioDataset(torch.utils.data.Dataset):

    def __init__(self, data_path, am_path, gender = "female", phoneme_idx = 4, am_idx = 1, MAX_LEN = 128, partition = "train"):
        """
        :param data_path: the root path of phonemes
        :param am_path: the path of am (.csv)
        :param gender: female or male
        :param phoneme_idx: the phoneme index
        :param am_idx: the index of target AM, should be int within [1, 96]
        :param MAX_LEN: max length of voice seq, if less, pad, if more, slice
        :param partition: train / val1 / val2 / test
        """

        self.MAX_LEN = MAX_LEN
        # get phoneme list
        self.target_phoneme_path = "/".join([data_path, gender, str(int(phoneme_idx))])
        phoneme_list = sorted(os.listdir(self.target_phoneme_path))
        random.shuffle(phoneme_list)
        length = len(phoneme_list)
        if partition == "train":
            self.phoneme_list = phoneme_list[:int(0.7 * length)]
        elif partition == "val1":
            self.phoneme_list = phoneme_list[int(0.7 * length):int(0.8 * length)]
        elif partition == "val2":
            self.phoneme_list = phoneme_list[int(0.8 * length):int(0.9 * length)]
        elif partition == "test":
            self.phoneme_list = phoneme_list[int(0.9 * length):]
            
        # if partition == "train":
        #     self.phoneme_list = phoneme_list[:int(0.7 * length)]
        # elif partition == "val1":
        #     self.phoneme_list = phoneme_list[int(0.7 * length):]


        self.length = len(self.phoneme_list)

        # get_am data
        am_data = pd.read_csv(am_path)
        self.am_data = am_data[["ID", str(am_idx)]]

    def __len__(self):
        return self.length

    def spectro_gram(self, sig, n_mels=64, n_fft=1024, hop_len=None):
        top_db = 80

        # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
        spec = transforms.MelSpectrogram(44100, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

        # Convert to decibels
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec

    def __getitem__(self, ind):
        item_filename = self.phoneme_list[ind]
        item_full_path = "/".join([self.target_phoneme_path, item_filename])
        phoneme = np.load(item_full_path)

        person_id = int(item_filename.split("_")[0][1:7])
        try:
            target_am = self.am_data[self.am_data["ID"] == person_id].values[0][-1]
        except:
            print("person id =", person_id)
            target_am = 0.

        # padding
        phoneme = torch.tensor(phoneme, dtype=torch.float) #.reshape(1, -1)
        # apply mel transform
        phoneme = self.spectro_gram(phoneme)

        std, mean = torch.std_mean(phoneme, unbiased=False, dim=0)
        phoneme = (phoneme - mean) / (std + 1e-6)

        if len(phoneme[0]) < MAX_LEN:
            phoneme = np.pad(phoneme, ((0, 0), (0, MAX_LEN - len(phoneme[0]))), 'constant', constant_values=(0, 0))
            phoneme = torch.from_numpy(phoneme)
        else:
            phoneme = phoneme[:, :MAX_LEN]
        # phoneme = torch.from_numpy(phoneme)
        ##################################################################
        phoneme.unsqueeze_(0)
        ##################################################################
        target_am = torch.tensor(target_am).to(torch.float32)
        
        return phoneme, target_am


In [4]:
# class AudioDataset(torch.utils.data.Dataset):

#     def __init__(self, data_path, am_path, gender = "female", phoneme_idx = 4, am_idx = 1, MAX_LEN = 44100 * 2, partition = "train"):
#         """
#         :param data_path: the root path of phonemes
#         :param am_path: the path of am (.csv)
#         :param gender: female or male
#         :param phoneme_idx: the phoneme index
#         :param am_idx: the index of target AM, should be int within [1, 96]
#         :param MAX_LEN: max length of voice seq, if less, pad, if more, slice
#         :param partition: train / val1 / val2 / test
#         """

#         self.MAX_LEN = MAX_LEN
#         # get phoneme list
#         self.target_phoneme_path = "/".join([data_path, gender, str(int(phoneme_idx))])
#         phoneme_list = sorted(os.listdir(self.target_phoneme_path))
#         length = len(phoneme_list)
#         if partition == "train":
#             self.phoneme_list = phoneme_list[:int(0.7 * length)]
#         elif partition == "val1":
#             self.phoneme_list = phoneme_list[int(0.7 * length):int(0.8 * length)]
#         elif partition == "val2":
#             self.phoneme_list = phoneme_list[int(0.8 * length):int(0.9 * length)]
#         elif partition == "test":
#             self.phoneme_list = phoneme_list[int(0.9 * length):]

#         self.length = len(self.phoneme_list)

#         # get_am data
#         am_data = pd.read_csv(am_path)
#         self.am_data = am_data[["ID", str(am_idx)]]

#     def __len__(self):
#         return self.length

#     def spectro_gram(self, sig, n_mels=64, n_fft=1024, hop_len=None):
#         top_db = 80

#         # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
#         spec = transforms.MelSpectrogram(44100, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

#         # Convert to decibels
#         spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
#         return spec

#     def padding(self, phoneme):
#         if len(phoneme) < self.MAX_LEN:
#             pad_begin_len = random.randint(0, self.MAX_LEN - len(phoneme))
#             pad_end_len = self.MAX_LEN - len(phoneme) - pad_begin_len

#             # Pad with 0s
#             pad_begin = np.zeros(pad_begin_len)
#             pad_end = np.zeros(pad_end_len)

#             phoneme = np.concatenate((pad_begin, phoneme, pad_end), 0)
#         else:
#             phoneme = phoneme[:self.MAX_LEN]
#         return phoneme

#     def __getitem__(self, ind):
#         item_filename = self.phoneme_list[ind]
#         item_full_path = "/".join([self.target_phoneme_path, item_filename])
#         phoneme = np.load(item_full_path)

#         person_id = int(item_filename.split("_")[0][1:7])
#         try:
#             target_am = self.am_data[self.am_data["ID"] == person_id].values[0][-1]
#         except:
#             print("person id =", person_id)
#             target_am = 0.

#         # padding
#         phoneme = self.padding(phoneme)
#         phoneme = torch.tensor(phoneme, dtype=torch.float) #.reshape(1, -1)
#         # apply mel transform
#         phoneme = self.spectro_gram(phoneme)
        
#         ################################### Normalization ######################################
#         std, mean = torch.std_mean(phoneme, unbiased=False, dim=0)
#         phoneme = (phoneme - mean) / (std + 1e-6)
#         # print(phoneme)
#         # ####################### convert phoneme from float32 to float64 ##################
#         # phoneme = phoneme.to(torch.float64)
#         # ##################################################################################

#         target_am = torch.tensor(target_am)
        
        
#         ####################################################################################
#         target_am = target_am.to(torch.float32)
#         # print(target_am)
#         ####################################################################################
        
#         # jia yi ge gui yi hua (phoneme)
        
#         return phoneme, target_am

In [5]:
# default_root_path = "./penstate_data/extract_phoneme"
default_root_path = "./penstate_data/extract_phoneme_processed"

# am_path = "./penstate_data/AMs_unnormalized.csv"
am_path = "./penstate_data/AMs_final.csv"

############## Female ##################
gender = "female"
phoneme_idx = 10
am_idx = 89

# gender = "female"
# phoneme_idx = 10
# am_idx = 13

# gender = "female"
# phoneme_idx = 10
# am_idx = 42

# gender = "female"
# phoneme_idx = 10
# am_idx = 7

############## Male ##################
# gender = "male"
# phoneme_idx = 10
# am_idx = 89

# gender = "male"
# phoneme_idx = 10
# am_idx = 51

# gender = "male"
# phoneme_idx = 10
# am_idx = 4

# gender = "male"
# phoneme_idx = 10
# am_idx = 64


In [None]:
# female am_idx: 89 13 88 51 14
# phoneme_idx: # 7 (ə) 4 (n) 31 (r) 17 (I)               6 (t)

In [6]:
MAX_LEN = 32 # TODO: may be too small
batch_size = 64
batch_size = config['batch_size']
train_data = AudioDataset(data_path=default_root_path,
                            am_path = am_path,
                            gender = gender, phoneme_idx = phoneme_idx, am_idx = am_idx, MAX_LEN = MAX_LEN, partition="train")

######################################################################################################################################
val_data = AudioDataset(data_path=default_root_path,
                            am_path = am_path,
                            gender = gender, phoneme_idx = phoneme_idx, am_idx = am_idx, MAX_LEN = MAX_LEN, partition="val1")
test_data = AudioDataset(data_path=default_root_path,
                            am_path = am_path,
                            gender = gender, phoneme_idx = phoneme_idx, am_idx = am_idx, MAX_LEN = MAX_LEN, partition="val1")
######################################################################################################################################

train_loader = torch.utils.data.DataLoader(train_data, num_workers=0,
                                               batch_size=batch_size, shuffle=True)

######################################################################################################################################
val_loader = torch.utils.data.DataLoader(val_data, num_workers=0,
                                               batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_data, num_workers=0,
                                               batch_size=batch_size)
######################################################################################################################################

print("Batch size: ", config['batch_size'])

print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))

Batch size:  32
Train dataset samples = 3067, batches = 96
Validation dataset samples = 438, batches = 14
Test dataset samples = 438, batches = 14


In [7]:
all_am = None
for i, data in enumerate(train_loader):
    phoneme, target_am = data
    # sns.heatmap(phoneme[0], cmap="rainbow")
    # plt.show()
    if all_am is None:
        all_am = target_am
    else:
        all_am = torch.cat([all_am, target_am])
    # print(phoneme.shape, target_am.shape)
    # break
with open(gender + "_am.txt", "a+") as f:
    f.write(f'{phoneme_idx},{am_idx},{all_am.mean().item()}\n')
print(f'{phoneme_idx},{am_idx},{all_am.mean().item()}\n')

10,89,-0.12617458403110504



In [8]:
print("Batch size: ", batch_size)
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))

# for i, data in enumerate(train_loader):
#     phoneme, target_am = data
#     print(phoneme.shape, target_am.shape)
#     ##########################################
#     # print(phoneme.dtype, target_am.dtype)
#     ##########################################
#     # break

Batch size:  32
Train dataset samples = 3067, batches = 96


## Model

## Model 8: MnasNet

##### MAE=0.68 ok???

In [9]:
model = models.mnasnet1_0(weights=None).to(device)
# print(model)
model.layers[0] = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
model.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
print(model)

MNASNet(
  (layers): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.00029999999999996696, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.00029999999999996696, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (7): BatchNorm2d(16, eps=1e-05, momentum=0.00029999999999996696, affine=True, track_running_stats=True)
    (8): Sequential(
      (0): _InvertedResidual(
        (layers): Sequential(
          (0): Conv2d(16, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(48, eps=1e-05, momentum=0.00029999999999996696, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
          (3): Conv2d(48, 

In [10]:
model = model.to(device)
phoneme, AM = next(iter(train_loader))
# # summary(model,(64, 259)) # After conv: torch.Size([2, 128, 5, 18])
# summary(model, phoneme.to(device))

## Model 9: Wide ResNet

In [11]:
# model = models.mnasnet1_0(weights=None).to(device)
# # print(model)
# model.layers[0] = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
# model.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
# # print(model)


In [12]:
# model = model.to(device)
# phoneme, AM = next(iter(train_loader))
# # # summary(model,(64, 259)) # After conv: torch.Size([2, 128, 5, 18])
# # summary(model, phoneme.to(device))

# Train and eval

In [13]:
torch.cuda.empty_cache()
gc.collect()

33

In [14]:
criterion = torch.nn.MSELoss() #Defining Loss function 
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate']) #Defining Optimizer
# optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=0.0001, last_epoch=-1)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[35,40,45,50,60,65,70,90,110,150,170,180], gamma=0.5) # add learning rate scheduler

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * config['epochs']))

In [15]:
def train(model, optimizer, criterion, dataloader):

    model.train()
    train_loss = 0.0 #Monitoring Loss
    
    #########################################################
    # AM_true_list = []
    # AM_pred_list = []
    #########################################################
    
    for iter, (phoneme, AM) in enumerate(dataloader):
        scheduler.step()
        ### Move Data to Device (Ideally GPU)
        phoneme = phoneme.to(device)
        AM = AM.to(device)

        ### Forward Propagation
        preds_AM = model(phoneme)

        ### Loss Calculation
        # print(AM.shape)
        preds_AM = torch.squeeze(preds_AM)
        # print(preds_AM)
        # print(preds_AM.shape)model = models.shufflenet_v2_x1_0(weights=None).to(device)
        loss = criterion(preds_AM, AM)
        train_loss += loss.item()
        
        #########################################################
        ### Store Pred and True Labels
        # AM_pred_list.extend(preds_AM.cpu().tolist())
        # AM_true_list.extend(AM.cpu().tolist())
        #########################################################

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Backward Propagation
        loss.backward()

        ### Gradient Descent
        optimizer.step()
        # if iter % 20 == 0:
        #     print("iter =", iter, "loss =",loss.item())
    train_loss /= len(dataloader)
    print("Learning rate = ", scheduler.get_last_lr()[0])
    print("Train loss = ", train_loss)
    
    #########################################################
    # print(AM_pred_list)
    # print(AM_true_list)
    # print(len(AM_pred_list))
    # print(len(AM_true_list))
    # accuracy = mean_squared_error(AM_pred_list, AM_true_list)
    # print("Train MSE accuracy: ", accuracy)
    #########################################################
    
    # scheduler.step() # add schedule learning rate
    return train_loss

In [16]:
def eval(model, dataloader):

    model.eval() # set model in evaluation mode

    AM_true_list = []
    AM_pred_list = []

    for i, data in enumerate(dataloader):

        phoneme, AM = data
        ### Move data to device (ideally GPU)
        phoneme, AM = phoneme.to(device), AM.to(device) 

        with torch.inference_mode(): # makes sure that there are no gradients computed as we are not training the model now
            ### Forward Propagation
            ### Get Predictions
            predicted_AM = model(phoneme)
            # print(predicted_AM)
        
        ### Store Pred and True Labels
        AM_pred_list.extend(predicted_AM.cpu().tolist())
        AM_true_list.extend(AM.cpu().tolist())
        
        # Do you think we need loss.backward() and optimizer.step() here?
    
        del phoneme, AM, predicted_AM
        torch.cuda.empty_cache()

    ###############################################################################################
    # print(AM_pred_list[1000:3100])
    # print(AM_true_list)
    # print(len(AM_pred_list))
    # print(len(AM_true_list))
    ###############################################################################################
    
    # print("Number of equals between two list: ", sum(a == b for a,b in zip(AM_pred_list, AM_true_list)))
    
    ### Calculate Accuracy
    MSE = mean_squared_error(AM_pred_list, AM_true_list)
    r2_score_acc = r2_score(AM_pred_list, AM_true_list)
    MAE = mean_absolute_error(AM_pred_list, AM_true_list)
    print("Validation r2_score: ", r2_score_acc)
    print("Validation MAE: ", MAE)
    
    return MSE

# Experiment

In [17]:
# Iterate over number of epochs to train and evaluate your model
torch.cuda.empty_cache()

best_mse = 1.0 ### Monitor best accuracy in your run

for epoch in range(config['epochs']):
    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    train_loss = train(model, optimizer, criterion, train_loader)
    MSE = eval(model, val_loader)

    print("\tTrain Loss: ", train_loss)
    print("\tValidation MSE: ", MSE)

    ### Save checkpoint if accuracy is better than your current best
    if MSE < best_mse:
        best_mse = MSE
    ### Save checkpoint with information you want
        torch.save({'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'loss': train_loss,
              'learning rate': scheduler.get_last_lr()[0],
              'mse': MSE}, 
        './model_checkpoint.pth')


Epoch 1/150




Learning rate =  0.00999890341737423
Train loss =  2.114860268930594
Validation r2_score:  -28402565.0283373
Validation MAE:  0.879159502259277
	Train Loss:  2.114860268930594
	Validation MSE:  1.1896642361967569

Epoch 2/150
Learning rate =  0.0099956141504943
Train loss =  1.015468344092369
Validation r2_score:  -11551319533.334398
Validation MAE:  0.7816657140403434
	Train Loss:  1.015468344092369
	Validation MSE:  0.9764082295172313

Epoch 3/150
Learning rate =  0.009990133642141364
Train loss =  1.0059166469921668
Validation r2_score:  -538330983293.9275
Validation MAE:  0.7827127919140054
	Train Loss:  1.0059166469921668
	Validation MSE:  0.9791264501167312

Epoch 4/150
Learning rate =  0.009982464296247528
Train loss =  1.0306617757305503
Validation r2_score:  -621402379742.7112
Validation MAE:  0.7823639948034795
	Train Loss:  1.0306617757305503
	Validation MSE:  0.978065718202774

Epoch 5/150
Learning rate =  0.009972609476841367
Train loss =  0.9646691760669152
Validation r2_

# Test

In [18]:
def test(model, test_loader):
  ### What you call for model to perform inference?
    model.eval()

  ### List to store predicted phonemes of test data
    test_predictions = []
    ground_truth = []

  ### Which mode do you need to avoid gradients?
    with torch.inference_mode():

        for i, data in enumerate(tqdm(test_loader)):

            phoneme, groundtruth_AM = data
            ### Move data to device (ideally GPU)
            phoneme, groundtruth_AM = phoneme.to(device), groundtruth_AM.to(device)         
          
            predicted_AM = model(phoneme)
            predicted_AM.squeeze_()
            # print(predicted_AM.shape)
            # print(groundtruth_AM.shape)

          ### How do you store predicted_phonemes with test_predictions? Hint, look at eval 
            test_predictions.extend(predicted_AM.cpu().tolist())
            ground_truth.extend(groundtruth_AM.cpu())
    
    # print(len(test_predictions))
    return test_predictions, ground_truth

In [19]:
predictions, ground_truth = test(model, test_loader)

100%|███████████████████████████████████████████| 14/14 [00:00<00:00, 31.17it/s]


In [21]:
import time

In [20]:
### Create CSV file with predictions
if gender == "female":
    g_flag = "F"
else:
    g_flag = "M"
    
with open("./%s_"%g_flag + "phoneme%s"%phoneme_idx +  "_AM%s" + ".csv"%am_idx, "w+") as f:
    f.write("person, label, prediction\n")
    for i in range(len(predictions)):
        f.write("{},{},{}\n".format(i, ground_truth[i], predictions[i]))

## 