In [17]:
#https://tips-memo.com/python-ae
import os
import sys
import numpy as np
import librosa
import glob

In [19]:
!python --version

Python 2.7.17


In [2]:
os.getcwd()

'/opt/resort/rikako/autoencoder'

In [3]:
from tools import EarlyStopping

In [4]:
def extract_mel(wav, sr, n_mels=64, hop_length=160, n_fft=512): #(timeframe, mel_dim) 
    audio, _ = librosa.load(wav, sr=sr)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels , hop_length=160, n_fft=512).T
    return mel

###
def data_list(path, sr, n_mels=64, hop_length=160, n_fft=512): #620
    wav_list = glob.glob(path)
    size = len(wav_list)
    data = np.ones((1, n_mels))
    count= 0
    for wavname in wav_list:
        component = extract_mel(wavname, sr=sr, n_mels=64, hop_length=160, n_fft=512)
        data = np.concatenate([data, component], axis=0)
        count += 1
        sys.stdout.write("\r%s" % "現在"+str(np.around((count/len(wav_list))*100 , 2))+"%完了")
        sys.stdout.flush()
    return data[1:], size
####
# pathの設定
train_wav_path= "./train/*.wav"
valid_wav_path = "./valid/*.wav"
                                                          

# パラメータの保存先指定
out_audio_dir = "./data/audio/out_audio_dir/"
if not os.path.exists(out_audio_dir):
    os.makedirs(out_audio_dir)


# wavデータの一括読み込み
# 今回はメル周波数スペクトログラムを15500Hz, 620次元取得して全データに関して縦に並べている
print("---read_training_wav---")
train_data_list, size_train = data_list(train_wav_path, sr=16000, hop_length=160, n_mels=64, n_fft=512)
print("\n---read_valid_wav---")
valid_data_list, size_test = data_list(valid_wav_path, sr=16000, hop_length=160, n_mels=64, n_fft=512)
print("\n")

---read_training_wav---
現在100.0%完了
---read_valid_wav---
現在100.0%完了



In [None]:
with open('train_data_list.pickle', mode='wb') as f:
        pickle.dump(train_data_list, f)

    with open('valid_data_list.pickle', mode='wb') as f:
        pickle.dump(valid_data_list, f)

In [6]:
import os
import numpy as np
import torch
import torchvision
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.optim.lr_scheduler import LambdaLR
#import matplotlib.pyplot as plt

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
print('torch.cuda.is_available() :',torch.cuda.is_available())

torch.cuda.is_available() : True


In [7]:
!nvidia-smi

Sun Dec 13 17:43:14 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.54                 Driver Version: 396.54                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1080    Off  | 00000000:01:00.0  On |                  N/A |
| 33%   41C    P8    11W / 180W |    129MiB /  8110MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [13]:
out_params_dir = "./data/models/params/"
out_figs_dir = "./data/models/images/"
out_texts_dir = "./data/models/texts/"
in_audio_dir = "./data/audio/"

if not os.path.exists(out_params_dir):
    os.makedirs(out_params_dir)
if not os.path.exists(out_figs_dir):
    os.makedirs(out_figs_dir)
if not os.path.exists(out_texts_dir):
    os.makedirs(out_texts_dir)
if not os.path.exists(in_audio_dir):
    os.makedirs(in_audio_dir)

In [14]:
#https://tasotasoso.hatenablog.com/entry/2020/01/12/184130?utm_source=feed #これも参考？
#https://qiita.com/mathlive/items/2a512831878b8018db02 #データセットの作り方
from torch.utils.data import Dataset

# ExpandDatasetは1つの時間フレームに対して前後10フレームを取得するようなデータセット
class ExpandDataset(Dataset):
    def __init__(self, data, transform=None):
        self.transform = transform
        print(self.transform)
        self.data = data
        print(self.data)
        self.data_num = len(data)
        if self.data.ndim==2:
            self.pad_data_fr = data[:10][::-1] #0~9行目までを
            self.pad_data_bc = data[-10:][::-1]
            self.pad_data = np.concatenate([self.pad_data_fr, data, self.pad_data_bc], axis=0)
        elif self.data.ndim==3:
            self.pad_data_fr = data[:,:10,:][:,::-1,:]
            self.pad_data_bc = data[:,-10:,:][:,::-1,:]
            self.pad_data = np.concatenate([self.pad_data_fr, data, self.pad_data_bc], axis=1)    
    def __len__(self):
        return self.data_num
    def __getitem__(self, idx):
        if self.transform:
            if self.data.ndim==2:
                #__getitem__が呼ばれると,idxから20取ってきて，一次元配列に加工するƒ
                out_data = self.transform(self.pad_data)[0][idx:idx+20].flatten()
            elif self.data.ndim==3:
                index = int(random.uniform(0,self.data.shape[1]))
                out_data = self.transform(self.pad_data)[:,idx, index:index+20].flatten()
        else:
            print("transformを使用しテンソル化してください")
        return out_data

In [15]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.dense_enc1 = nn.Linear(1280, 1200)
        self.bn1 = nn.BatchNorm1d(1200)
        self.dense_enc2 = nn.Linear(1200, 1100)
        self.bn2 = nn.BatchNorm1d(1100)
        self.dense_enc3 = nn.Linear(1100,1024)
    
        self.dense_dec1 = nn.Linear(1024,1100)
        self.bn4 = nn.BatchNorm1d(1100)
        self.dense_dec2 = nn.Linear(1100, 1200)
        self.bn5 = nn.BatchNorm1d(1200)
        self.drop1 = nn.Dropout(p=0.2)
        self.dense_dec3 = nn.Linear(1200, 1280)

    def encoder(self, x):
        x = F.relu(self.dense_enc1(x))
        x = self.bn1(x)
        x = F.relu(self.dense_enc2(x))
        x = self.bn2(x)
        x = self.dense_enc3(x)
        return x

    def decoder(self, x):
        x = F.relu(self.dense_dec1(x))
        x = self.bn4(x)
        x = F.relu(self.dense_dec2(x))
        x = self.bn5(x)
        x = self.drop1(x)
        x = self.dense_dec3(x)
        return x

    def forward(self, x):
        z = self.encoder(x)
        x = self.decoder(z)
        return x, z

In [13]:
#データセット作成に必要
transform = transforms.Compose([transforms.ToTensor()])
train_dataset = ExpandDataset(train_data_list, transform)
#print(type(train_dataset))
print(train_data_list.shape) #(1017741, 64)

Compose(
    ToTensor()
)
[[4.50765656e-04 5.30805322e-04 4.66224774e-05 ... 2.60053270e-07
  2.71601010e-07 1.05517472e-07]
 [4.18963784e-04 4.02063539e-04 1.58382143e-04 ... 5.17296712e-07
  3.66128631e-07 8.42486116e-08]
 [1.46318285e-03 4.32728790e-04 2.18650584e-05 ... 3.93063090e-07
  5.57712269e-07 4.71317421e-08]
 ...
 [4.11660585e-04 8.21963331e-05 3.10359537e-05 ... 3.40592948e-07
  2.95967340e-07 4.87355045e-08]
 [5.05660602e-04 1.04563136e-04 2.13599142e-05 ... 3.72386722e-07
  2.76446372e-07 3.13855359e-08]
 [5.35616782e-05 3.46706693e-05 3.10118185e-06 ... 3.72152414e-07
  8.44074250e-08 1.45347610e-08]]
(14959913, 64)


In [14]:
print(valid_data_list.shape) 
valid_dataset = ExpandDataset(valid_data_list, transform)

(3688445, 64)
Compose(
    ToTensor()
)
[[1.12353568e-03 2.32042425e-04 8.63318273e-05 ... 2.80149095e-07
  1.42585620e-07 2.76045746e-08]
 [8.54921702e-04 5.76749793e-04 2.92755321e-05 ... 2.58800924e-07
  1.49010120e-07 3.15013402e-08]
 [5.29345532e-04 1.78880757e-04 5.72159406e-05 ... 1.71984652e-07
  1.28817035e-07 5.22359400e-08]
 ...
 [6.01731881e-04 2.33925020e-04 2.85883125e-05 ... 1.25567317e-06
  7.95226242e-07 5.98488228e-08]
 [1.34566362e-04 2.21855677e-04 5.31124897e-05 ... 7.31991975e-07
  4.55107511e-07 9.62500906e-08]
 [8.93667529e-05 6.47487977e-05 4.50981588e-06 ... 6.42957446e-07
  4.22305192e-07 1.39114945e-07]]


In [15]:
train_batch_size = 100 #25
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

In [16]:
valid_batch_size = 100 #25
valid_loader = DataLoader(valid_dataset, batch_size=valid_batch_size, shuffle=True)

In [17]:
num_epochs = 1
learning_rate = 1e-4

model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

In [18]:
def func_100(epoch):
    if epoch <= 100:
        return 1
    elif 100 < epoch:
        return -0.99*(1e-2)*(epoch) + 1.99

In [19]:
scheduler = LambdaLR(optimizer, lr_lambda=func_100)

In [22]:
#https://github.com/Bjarten/early-stopping-pytorch
def train(model,train_loader,criterion,optimizer):
    model.train()
    train_losses = []
    for x in train_loader:
        x = x.to(device)
        model.zero_grad()
        y, z = model(x.float())
        
        loss = criterion(y.float(), x.float())
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        
    return train_losses

def valid(model, valid_loader, criterion):
    model.eval()
    valid_losses = []
    for x in valid_loader:
        x = x.to(device)
        y, z = model(x.float())
        
        loss = criterion(y.float(), x.float())
        valid_losses.append(loss.item())
    return valid_losses

#def train_model(model, batch_size, train_loader, valid_loader, patience, criterion, optimizer, num_epochs, PATH):
def train_model(model, batch_size, train_loader, valid_loader, patience, criterion, optimizer, num_epochs):
        
    avg_train_losses = []
    avg_valid_losses = []
    
    early_stopping = EarlyStopping(patience = patience, verbose = True)
    
    for epoch in range(num_epochs):     
        train_losses = train(model,train_loader,criterion,optimizer)
        valid_losses = valid(model,valid_loader,criterion)

        train_loss = np.average(train_losses)
        valid_loss = np.average(valid_losses)
        
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)
        
        epoch_len = len(str(num_epochs))
        
        print_msg = (f'[{epoch:>{epoch_len}}/{num_epochs:>{epoch_len}}]' + f'train_loss: {train_loss:.5f}' + f'valid_loss: {valid_loss:.5f}')
        print(print_msg)
        
        
        early_stopping(valid_loss,model)
        
        if early_stopping.early_stop:
            print("Early Stopping")
            break
        
    model.load_state_dict(torch.load('checkpoint.pt'))
    return model, avg_train_losses, avg_valid_losses

In [23]:
batch_size = 100 #256
n_epochs = 1
#PATH="/opt/resort/rikako/autoencoder/data/models/params"

#train_loader, test_loader, valid_loader = create_datasets(batch_size)
#train_loader, valid_loader = create_datasets(batch_size)

# early stopping patience; how long to wait after last time validation loss improved.
patience = 20

#PATH
model, train_loss, valid_loss = train_model(model, batch_size, train_loader, valid_loader, patience, criterion, optimizer, num_epochs)
#なんでやろ

[0/1]train_loss: 0.03990valid_loss: 2.20443
Validation loss decreased (inf --> 2.204427).  Saving model ...


In [None]:
import matplotlib.pyplot as plt

# visualize the loss as the network trained
fig = plt.figure(figsize=(10,8))
plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
plt.plot(range(1,len(valid_loss)+1),valid_loss,label='Validation Loss')

# find position of lowest validation loss
minposs = valid_loss.index(min(valid_loss))+1 
plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')

plt.xlabel('epochs')
plt.ylabel('loss')
plt.ylim(0, 0.5) # consistent scale
plt.xlim(0, len(train_loss)+1) # consistent scale
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
#fig.savefig('loss_plot.png', bbox_inches='tight')