In [1]:
import torch
from torch.utils.tensorboard import SummaryWriter
# from python_files.unet_basic import Model
from python_files.Noise_Reduction_Datagen_paths import Signal_Synthesis_DataGen
from fastai.text.all import *
from fastai.data.core import DataLoaders
import numpy as np

import os
from glob import glob

In [2]:
def get_paths(dir_path):
    """Recursively collect ``.wav`` / ``.mp3`` file paths under ``dir_path``.

    Sub-directories and file names are visited in sorted order so the result
    is ordered identically on every run and filesystem. This matters because
    callers in this notebook pick train/validation subsets by index slices.

    Args:
        dir_path: Root directory to search.

    Returns:
        1-D numpy string array of ``os.path.join(root, name)`` paths. The
        array is empty (and still string-typed) when nothing matches.
    """
    paths = []
    for root, dirs, files in os.walk(dir_path):
        # os.walk is top-down by default: sorting `dirs` in place fixes the
        # order in which sub-directories are descended into.
        dirs.sort()
        for name in sorted(files):
            if name.endswith((".wav", ".mp3")):
                paths.append(os.path.join(root, name))

    # dtype=str keeps an empty result from silently becoming a float array.
    return np.asarray(paths, dtype=str)

In [3]:
# Load a previously saved NumPy array from disk (judging by the file name it
# holds signal paths / counts — TODO confirm its contents).
# NOTE(review): `np_paths` is not referenced by any other cell visible here.
np_paths = np.load("./dataset_loader_files/signal_paths_nums_save.npy")

In [4]:
# --- Audio / STFT settings shared with the validation cell below ---
default_sr = 16000
n_fft = 1024
win_length = n_fft
hop_len = n_fft // 4
sec = (16384 / default_sr) * 2
create_specgram = False
perform_stft = False
device_datagen = "cpu"

# NOTE(review): `augment`, `train` and `noise_save_path` are assigned here but
# never passed on; the constructor call below hard-codes augment=False.
augment = True
train = True
noise_save_path = ""  # "./dataset_loader_files/noise_paths_resampled_save.npy"
signal_dir = ""  # "./dataset/cv-corpus-5.1-2020-06-22-Resampled/en/clips"

# --- Training subset: first 1000 noise files and first 1000 clean signals ---
noise_paths = get_paths("./dataset/Reduced_noise/")[:1000]
signal_paths = get_paths("./dataset/Reduced_clean_signals/")[:1000]

train_ds = Signal_Synthesis_DataGen(
    noise_paths,
    signal_paths,
    signal_dir,
    n_fft=n_fft,
    win_length=win_length,
    hop_len=hop_len,
    create_specgram=create_specgram,
    perform_stft=perform_stft,
    normalize=True,
    default_sr=default_sr,
    sec=sec,
    epsilon=1e-5,
    augment=False,
    device=device_datagen,
)

1000 ./dataset/Reduced_clean_signals/common_voice_en_22252098.mp3


In [5]:
# --- Validation subset: index ranges that do not overlap the training slices ---
# NOTE(review): `train` and `noise_save_path` are assigned but not passed on.
train = False
noise_save_path = ""  # "./dataset_loader_files/noise_paths_resampled_save.npy"
signal_dir = ""  # "./dataset/cv-corpus-5.1-2020-06-22-Resampled/en/clips"

noise_paths = get_paths("./dataset/Reduced_noise/")[1200:1300]
signal_paths = get_paths("./dataset/Reduced_clean_signals/")[6000:6100]

# The audio / STFT settings are the names defined in the training-dataset cell.
val_ds = Signal_Synthesis_DataGen(
    noise_paths,
    signal_paths,
    signal_dir,
    n_fft=n_fft,
    win_length=win_length,
    hop_len=hop_len,
    create_specgram=create_specgram,
    perform_stft=perform_stft,
    normalize=True,
    default_sr=default_sr,
    sec=sec,
    epsilon=1e-5,
    augment=False,
    device=device_datagen,
)

100 ./dataset/Reduced_clean_signals/common_voice_en_22347670.mp3


In [6]:
# --- Loader settings ---
BATCH_SIZE = 400
num_workers = 0
pin_memory = False
# NOTE(review): `shuffle` and `device` are defined but not used by the
# DataLoaders call below, which places the loaders on the CPU explicitly.
shuffle = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the training and validation datasets in a fastai DataLoaders pair.
dls = DataLoaders.from_dsets(
    train_ds,
    val_ds,
    bs=BATCH_SIZE,
    num_workers=num_workers,
    pin_memory=pin_memory,
).to("cpu")

In [7]:
# %%time
# for data in dls.train:
#     print(data[0].max(), data[0].min())

In [8]:
class Mod_MSELoss(nn.Module):
    """Mean-squared-error loss multiplied by a constant factor.

    Args:
        mul_factor: Scalar applied to the mean-reduced MSE value.
    """

    def __init__(self, mul_factor):
        super().__init__()
        self.mul_factor = mul_factor
        self.loss_fn = nn.MSELoss(reduction="mean")

    def forward(self, sig_pred, sig_true):
        """Return ``mul_factor * MSE(sig_pred, sig_true)``."""
        return self.mul_factor * self.loss_fn(sig_pred, sig_true)

In [9]:
class DownSamplingLayer(nn.Module):
    """Conv1d -> BatchNorm1d -> LeakyReLU(0.1) block.

    With the defaults (kernel 15, padding 7, stride 1, dilation 1) the block
    keeps the temporal length unchanged; it only changes the channel count.

    Args:
        channel_in: Number of input channels.
        channel_out: Number of output channels.
        dilation: Dilation of the convolution.
        kernel_size: Convolution kernel width.
        stride: Convolution stride.
        padding: Zero padding applied on both sides.
    """

    def __init__(self, channel_in, channel_out, dilation=1, kernel_size=15, stride=1, padding=7):
        super().__init__()
        conv = nn.Conv1d(
            channel_in,
            channel_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
        )
        norm = nn.BatchNorm1d(channel_out)
        activation = nn.LeakyReLU(negative_slope=0.1)
        # Kept as `main` with this exact order so parameter names stay `main.0.*` / `main.1.*`.
        self.main = nn.Sequential(conv, norm, activation)

    def forward(self, ipt):
        """Apply the conv/norm/activation stack to ``ipt``."""
        return self.main(ipt)

class UpSamplingLayer(nn.Module):
    """Conv1d -> BatchNorm1d -> in-place LeakyReLU(0.1) block.

    With the defaults (kernel 5, padding 2, stride 1) the temporal length is
    preserved; the block itself performs no interpolation.

    Args:
        channel_in: Number of input channels.
        channel_out: Number of output channels.
        kernel_size: Convolution kernel width.
        stride: Convolution stride.
        padding: Zero padding applied on both sides.
    """

    def __init__(self, channel_in, channel_out, kernel_size=5, stride=1, padding=2):
        super().__init__()
        conv = nn.Conv1d(
            channel_in,
            channel_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        norm = nn.BatchNorm1d(channel_out)
        activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
        # Kept as `main` with this exact order so parameter names stay `main.0.*` / `main.1.*`.
        self.main = nn.Sequential(conv, norm, activation)

    def forward(self, ipt):
        """Apply the conv/norm/activation stack to ``ipt``."""
        return self.main(ipt)

class Model(nn.Module):
    """1-D U-Net style encoder/decoder over inputs of shape (batch, 1, length).

    Encoder: ``n_layers`` DownSamplingLayer blocks; after each block the
    activation is stored as a skip connection and then decimated by taking
    every second sample. Layer ``i`` (1-based) outputs
    ``i * channels_interval`` channels.

    Decoder: ``n_layers`` UpSamplingLayer blocks; before each block the
    activation is linearly interpolated to the length of the matching skip
    tensor and concatenated with it along the channel axis.

    Output: the decoder result is concatenated with the raw input and reduced
    to one channel by a 1x1 convolution followed by a sigmoid.

    Args:
        n_layers: Number of encoder (and decoder) stages.
        channels_interval: Channel increment per encoder stage.
    """

    def __init__(self, n_layers=12, channels_interval=24):
        super(Model, self).__init__()
        self.n_layers = n_layers
        self.channels_interval = channels_interval
        encoder_in_channels_list = [1] + [i * self.channels_interval for i in range(1, self.n_layers)]
        encoder_out_channels_list = [i * self.channels_interval for i in range(1, self.n_layers + 1)]
        # Length after each decimation for a 16384-sample input and 12 layers:
        #          1    => 2    => 3    => 4    => 5    => 6   => 7   => 8   => 9  => 10 => 11 =>12
        # 16384 => 8192 => 4096 => 2048 => 1024 => 512 => 256 => 128 => 64 => 32 => 16 =>  8 => 4
        self.encoder = nn.ModuleList()
        for channel_in, channel_out in zip(encoder_in_channels_list, encoder_out_channels_list):
            self.encoder.append(
                DownSamplingLayer(
                    channel_in=channel_in,
                    channel_out=channel_out
                )
            )

        bottleneck_channels = self.n_layers * self.channels_interval
        self.middle = nn.Sequential(
            nn.Conv1d(bottleneck_channels, bottleneck_channels, 15, stride=1,
                      padding=7),
            nn.BatchNorm1d(bottleneck_channels),
            nn.LeakyReLU(negative_slope=0.1, inplace=True)
        )

        # Each decoder stage sees the previous stage's output concatenated with
        # the skip tensor of the mirrored encoder stage, hence the (2i + 1)
        # multiples; the first decoder stage sees bottleneck + deepest skip.
        decoder_in_channels_list = [(2 * i + 1) * self.channels_interval for i in range(1, self.n_layers)] + [
            2 * self.n_layers * self.channels_interval]
        decoder_in_channels_list = decoder_in_channels_list[::-1]
        decoder_out_channels_list = encoder_out_channels_list[::-1]
        self.decoder = nn.ModuleList()
        for channel_in, channel_out in zip(decoder_in_channels_list, decoder_out_channels_list):
            self.decoder.append(
                UpSamplingLayer(
                    channel_in=channel_in,
                    channel_out=channel_out
                )
            )

        self.out = nn.Sequential(
            nn.Conv1d(1 + self.channels_interval, 1, kernel_size=1, stride=1),
            nn.Sigmoid()
        )

    def forward(self, input):
        """Run the network on ``input`` of shape (batch, 1, length).

        Returns:
            Tensor of shape (batch, 1, length) with values in [0, 1] (sigmoid).
        """
        skips = []
        o = input
        for down in self.encoder:
            o = down(o)
            skips.append(o)
            # Decimate by 2: keep every second sample (length becomes ceil(L / 2)).
            o = o[:, :, ::2]

        o = self.middle(o)

        for up, skip in zip(self.decoder, reversed(skips)):
            # Upsample to the skip tensor's exact length instead of a fixed
            # factor of 2. For lengths divisible by 2**n_layers this is the
            # same result (align_corners=True derives the sampling grid from
            # the in/out sizes), and it also works for any other length,
            # where doubling would not match the skip tensor.
            o = F.interpolate(o, size=skip.shape[-1], mode="linear", align_corners=True)
            o = torch.cat([o, skip], dim=1)
            o = up(o)

        o = torch.cat([o, input], dim=1)
        o = self.out(o)
        return o


In [11]:

# Build the 12-stage network and restore weights from an earlier run.
model = Model(12, 24)
# map_location="cpu" lets a checkpoint saved from a CUDA device be restored on
# a machine without a GPU; the model is kept on the CPU below in any case.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(
    torch.load("./Model_saves/torch_model_save_large_dataset.pt", map_location="cpu")
)
model.to("cpu")


# Optimiser, loss (MSE scaled by 1000) and epoch budget for training.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0)
criterion = Mod_MSELoss(mul_factor=1000)
n_epochs=100


model.train()
# Gradient scaling only applies to CUDA mixed precision; enabling it
# conditionally avoids the "CUDA is not available" warning on CPU-only hosts.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

In [12]:
# TensorBoard writer; event files are written under ./runs/CNN_model_experiment_1.
writer = SummaryWriter("./runs/CNN_model_experiment_1")

In [13]:
%%time
# Pull a single (noise, sig) batch from the training loader and stop right
# away; both names stay bound at notebook scope for the cells that follow.
for noise, sig in dls.train:
    break

CPU times: user 11.6 s, sys: 144 ms, total: 11.8 s
Wall time: 2.02 s


In [14]:
# Record the model graph in TensorBoard, using the sampled `noise` batch as
# the example input.
writer.add_graph(model, noise)
# Flush pending events and close the event file.
writer.close()

In [15]:
# Log the first item of the sampled batch as an audio clip. `.detach()` keeps
# the NumPy conversion valid even if the tensor is attached to an autograd graph.
writer.add_audio(
    "runs/CNN_model_experiment_1",
    noise[0].detach().squeeze().t().to("cpu").numpy(),
    sample_rate=default_sr,
)
# The writer was closed in the previous cell, so add_audio re-opens a fresh
# event file; flush explicitly so the clip reaches disk right away.
writer.flush()