In [70]:
from torch.utils.data import Dataset, DataLoader
import os
import librosa
import numpy as np


class MyDataset(Dataset):
    """Two-speaker separation dataset read directly from wav folders.

    ``data_dir`` must hold three sub-directories, ``mix``, ``s1`` and ``s2``.
    Item ``i`` pairs the i-th entry of each directory listing, so the listings
    are sorted: ``os.listdir`` gives no ordering guarantee, and unsorted
    listings could pair a mixture with the wrong sources.
    """

    def __init__(self, data_dir, batch_size, sample_rate, segment):
        """
        Args:
            data_dir: directory containing the ``mix``, ``s1`` and ``s2`` folders
            batch_size: stored on the instance; not read by this class
            sample_rate: passed to ``librosa.load`` as ``sr``
            segment: stored on the instance; not read by this class
        """
        super(MyDataset, self).__init__()

        self.data_dir = data_dir
        self.sr = sample_rate
        self.batch_size = batch_size
        self.segment = segment

        self.mix_dir = os.path.join(data_dir, "mix")
        self.s1_dir = os.path.join(data_dir, "s1")
        self.s2_dir = os.path.join(data_dir, "s2")

        # Sorted so that index i refers to the same file name in all three
        # listings whenever the folders hold identically named files.
        self.mix_list = sorted(os.listdir(os.path.abspath(self.mix_dir)))
        self.s1_list = sorted(os.listdir(os.path.abspath(self.s1_dir)))
        self.s2_list = sorted(os.listdir(os.path.abspath(self.s2_dir)))

    def __getitem__(self, item):
        """Load one example.

        Returns:
            mix_data: the mixture waveform returned by ``librosa.load``
            length: number of samples in ``mix_data``
            s_data: ``s1`` and ``s2`` waveforms stacked along a new first axis
                (``np.stack`` raises if the two sources differ in length)
        """
        mix_path = os.path.join(self.mix_dir, self.mix_list[item])
        mix_data = librosa.load(path=mix_path,
                                sr=self.sr,
                                mono=True,  # single channel
                                offset=0,  # start reading at the beginning
                                duration=None,  # read the whole file
                                dtype=np.float32,
                                res_type="kaiser_best",
                                )[0]
        length = len(mix_data)

        s1_path = os.path.join(self.s1_dir, self.s1_list[item])
        s1_data = librosa.load(path=s1_path,
                               sr=self.sr,
                               mono=True,  # single channel
                               offset=0,  # start reading at the beginning
                               duration=None,  # read the whole file
                               )[0]

        s2_path = os.path.join(self.s2_dir, self.s2_list[item])
        s2_data = librosa.load(path=s2_path,
                               sr=self.sr,
                               mono=True,  # single channel
                               offset=0,  # start reading at the beginning
                               duration=None,  # read the whole file
                               )[0]

        s_data = np.stack((s1_data, s2_data), axis=0)

        return mix_data, length, s_data

    def __len__(self):
        """Number of mixture files found in the ``mix`` folder."""
        return len(self.mix_list)


In [22]:
import json
class AudioDataset(Dataset):
    """Dataset whose items are pre-built minibatch descriptions.

    Every item is the list ``[mix_infos, s1_infos, s2_infos, sample_rate,
    segment_len]``, where each ``*_infos`` is a list of ``[wav_file, #samples]``
    entries. No audio is read here; only the JSON index files are.
    """

    def __init__(self, json_dir, batch_size, sample_rate=8000, segment=4.0, cv_maxlen=8.0):
        """
        Args:
            json_dir: directory including mix.json, s1.json and s2.json
            batch_size: in segment mode, the maximum number of segments per
                minibatch; in full-utterance mode, the number of utterances
            sample_rate: samples per second, used to convert ``segment`` to samples
            segment: duration of audio segment in seconds; a negative value
                (e.g. -1) selects full-utterance mode
            cv_maxlen: currently unused (the long-utterance filter is commented out)
        xxx_infos is a list and each item is a pair (wav_file, #samples)

        Raises:
            ValueError: if the three JSON lists differ in length, or if
                ``segment`` is non-negative but shorter than one sample.
        """
        # Function-scope import: math.ceil is needed below and the cell that
        # defines this class does not import math.
        import math

        super(AudioDataset, self).__init__()
        mix_infos = self._read_infos(json_dir, 'mix.json')
        s1_infos = self._read_infos(json_dir, 's1.json')
        s2_infos = self._read_infos(json_dir, 's2.json')
        if not (len(mix_infos) == len(s1_infos) == len(s2_infos)):
            raise ValueError(
                "mix.json, s1.json and s2.json must list the same number of "
                "utterances, got {}, {} and {}".format(
                    len(mix_infos), len(s1_infos), len(s2_infos)))

        # Sort by #samples (impl bucket), longest first. One permutation,
        # derived from the mixture lengths, is applied to all three lists so
        # entry i always describes the same utterance; sorting each list on its
        # own length mis-pairs files as soon as a source length differs.
        order = sorted(range(len(mix_infos)),
                       key=lambda i: int(mix_infos[i][1]), reverse=True)
        sorted_mix_infos = [mix_infos[i] for i in order]
        sorted_s1_infos = [s1_infos[i] for i in order]
        sorted_s2_infos = [s2_infos[i] for i in order]

        if segment >= 0.0:
            # segment length and count dropped utts
            # 4s * 8000/s = 32000 samples
            segment_len = int(segment * sample_rate)
            if segment_len <= 0:
                raise ValueError(
                    "segment * sample_rate must be at least one sample, got "
                    "segment={} and sample_rate={}".format(segment, sample_rate))
            drop_utt, drop_len = 0, 0
            for info in sorted_mix_infos:
                sample = int(info[1])
                if sample < segment_len:
                    drop_utt += 1
                    drop_len += sample
            # samples / (samples per second) / (3600 seconds per hour) = hours
            print("Drop {} utts({:.2f} h) which is short than {} samples".format(
                drop_utt, drop_len / sample_rate / 3600, segment_len))
            # generate minibatch information
            minibatch = []
            start = 0
            while True:
                num_segments = 0
                end = start
                part_mix, part_s1, part_s2 = [], [], []
                while num_segments < batch_size and end < len(sorted_mix_infos):
                    utt_len = int(sorted_mix_infos[end][1])
                    if utt_len >= segment_len:  # skip too short utt
                        num_segments += math.ceil(utt_len / segment_len)
                        # Ensure num_segments does not exceed batch_size
                        if num_segments > batch_size:
                            # An utterance that alone exceeds batch_size is
                            # kept as a minibatch of its own; otherwise it is
                            # left for the next minibatch.
                            if start == end:
                                part_mix.append(sorted_mix_infos[end])
                                part_s1.append(sorted_s1_infos[end])
                                part_s2.append(sorted_s2_infos[end])
                                end += 1
                            break
                        part_mix.append(sorted_mix_infos[end])
                        part_s1.append(sorted_s1_infos[end])
                        part_s2.append(sorted_s2_infos[end])
                    end += 1
                if len(part_mix) > 0:
                    minibatch.append([part_mix, part_s1, part_s2,
                                      sample_rate, segment_len])
                if end == len(sorted_mix_infos):
                    break
                start = end
            self.minibatch = minibatch
        else:  # Load full utterance but not segment
            # Minimum length, in samples, of the first (longest) utterance of a
            # minibatch; hard-coded as 4 * 8000 regardless of sample_rate.
            min_len = 4 * 8000
            # generate minibatch information
            minibatch = []
            start = 0
            while start < len(sorted_mix_infos):
                end = min(len(sorted_mix_infos), start + batch_size)
                # Skip long audio to avoid out-of-memory issue
                # if int(sorted_mix_infos[start][1]) > cv_maxlen * sample_rate:
                #     start = end
                #     continue
                # skip the minibatch when even its longest utt is too short
                if int(sorted_mix_infos[start][1]) >= min_len:
                    minibatch.append([sorted_mix_infos[start:end],
                                      sorted_s1_infos[start:end],
                                      sorted_s2_infos[start:end],
                                      sample_rate, segment])
                start = end
            self.minibatch = minibatch

    @staticmethod
    def _read_infos(json_dir, file_name):
        """Load one list of ``[wav_file, #samples]`` entries from ``json_dir``."""
        with open(os.path.join(json_dir, file_name), 'r') as f:
            return json.load(f)

    def __getitem__(self, index):
        """Return the minibatch description stored at ``index``."""
        return self.minibatch[index]

    def __len__(self):
        """Number of minibatches."""
        return len(self.minibatch)

In [23]:
import torch

class AudioDataLoader(DataLoader):
    """DataLoader that always collates with ``_collate_fn``.

    NOTE: use batch_size=1 with this loader (``_collate_fn`` asserts that it
    receives exactly one item), so drop_last=True makes no sense here.
    """

    def __init__(self, *args, **kwargs):
        """Forward every argument to ``DataLoader``, then install the collate function."""
        super().__init__(*args, **kwargs)
        # Assigned after the parent constructor has run, so a collate_fn passed
        # by the caller is replaced as well.
        self.collate_fn = _collate_fn

def _collate_fn(batch):
    """Turn one pre-built minibatch description into padded tensors.

    Args:
        batch: list, len(batch) = 1. See AudioDataset.__getitem__()
    Returns:
        mixtures_pad: B x T, torch.Tensor
        ilens : B, torch.Tensor
        sources_pad: B x C x T, torch.Tensor
    """
    # the loader must hand over exactly one minibatch description
    assert len(batch) == 1
    mixtures, sources = load_mixtures_and_sources(batch[0])

    pad_value = 0
    mix_tensors = [torch.from_numpy(wav).float() for wav in mixtures]
    src_tensors = [torch.from_numpy(src).float() for src in sources]

    # unpadded length of every mixture in the batch
    lengths = np.array([wav.shape[0] for wav in mixtures])
    ilens = torch.from_numpy(lengths)

    mixtures_pad = pad_list(mix_tensors, pad_value)
    # N x T x C -> N x C x T
    sources_pad = pad_list(src_tensors, pad_value).permute((0, 2, 1)).contiguous()
    return mixtures_pad, ilens, sources_pad


def load_mixtures_and_sources(batch):
    """Read the audio of one minibatch description.

    Each info holds a wav path followed by the recorded number of samples.
    Returns:
        mixtures: a list of 1-D np.ndarray, one per emitted chunk
        sources: a list of T x C np.ndarray (C = 2), aligned with ``mixtures``
        In segment mode (segment_len >= 0) one utterance yields several
        chunks; otherwise each utterance yields a single chunk cut to at most
        4*8000 samples.
    """
    mix_infos, s1_infos, s2_infos, sample_rate, segment_len = batch
    mixtures = []
    sources = []
    for mix_info, s1_info, s2_info in zip(mix_infos, s1_infos, s2_infos):
        # the three index files must agree on the utterance length
        assert mix_info[1] == s1_info[1] and s1_info[1] == s2_info[1]
        # read wav files
        mix, _ = librosa.load(mix_info[0], sr=sample_rate)
        s1, _ = librosa.load(s1_info[0], sr=sample_rate)
        s2, _ = librosa.load(s2_info[0], sr=sample_rate)
        # merge s1 and s2 into one T x C array, C = 2
        s = np.dstack((s1, s2))[0]
        utt_len = mix.shape[-1]
        if segment_len < 0:
            # full-utterance mode: keep only the leading 4*8000 samples
            limit = 4*8000
            mixtures.append(mix[:limit])
            sources.append(s[:limit, :])
            continue
        # consecutive, non-overlapping windows of segment_len samples
        for begin in range(0, utt_len - segment_len + 1, segment_len):
            stop = begin + segment_len
            mixtures.append(mix[begin:stop])
            sources.append(s[begin:stop])
        # a remainder is covered by one extra window taken from the end
        if utt_len % segment_len != 0:
            mixtures.append(mix[-segment_len:])
            sources.append(s[-segment_len:])
    return mixtures, sources

def pad_list(xs, pad_value):
    """Stack tensors of differing first-dimension length into one padded tensor.

    Args:
        xs: non-empty list of tensors that agree on every dimension but the first
        pad_value: value written wherever a tensor is shorter than the longest one
    Returns:
        Tensor of shape (len(xs), max_len, *xs[0].size()[1:]) with the dtype
        and device of ``xs[0]``.
    """
    longest = max(t.size(0) for t in xs)
    template = xs[0]
    out_shape = (len(xs), longest) + tuple(template.size()[1:])
    padded = template.new_empty(out_shape).fill_(pad_value)
    for row, t in enumerate(xs):
        padded[row, :t.size(0)] = t
    return padded

In [71]:
# Build the training set from the wav folders (MyDataset lists the mix/, s1/
# and s2/ sub-directories of data_dir; it does not read any JSON file).
tr_dataset = MyDataset(data_dir="dataset/min/tr",  # directory holding the mix/, s1/ and s2/ sub-folders
                              batch_size=1,
                              sample_rate=8000,  # sampling rate handed to librosa.load
                              segment=-1)  # segment duration; stored by MyDataset but never read


In [73]:
tr_dataset.__getitem__(0)

(array([-0.00448608, -0.00491333, -0.00112915, ...,  0.00454712,
         0.00067139,  0.0045166 ], dtype=float32),
 88920,
 array([[ 0.00082397,  0.00064087,  0.00045776, ...,  0.        ,
          0.        ,  0.        ],
        [-0.00531006, -0.0055542 , -0.00158691, ...,  0.00454712,
          0.00067139,  0.0045166 ]], dtype=float32))

In [65]:
import os
os.listdir("dataset/json/tr")

['mix.json', 's1.json', 's2.json']

In [74]:
# NOTE(review): if tr_dataset is the MyDataset instance created earlier, each
# item is a (mix, length, sources) tuple, whereas the loader's collate function
# unpacks a 5-element AudioDataset minibatch -- iterating would then fail.
# Confirm that an AudioDataset is what should be passed here.
tr_loader=AudioDataLoader(tr_dataset,batch_size=1,
                                shuffle=True,
                                num_workers=0)

In [75]:
tr_loader.

TypeError: DataLoader.__setattr__() missing 2 required positional arguments: 'attr' and 'val'

In [58]:
tr_dataset[0]

[[['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tr\\mix\\57786.wav',
   237880]],
 [['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tr\\s1\\57786.wav',
   237880]],
 [['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tr\\s2\\57786.wav',
   237880]],
 8000,
 -1]

In [1]:
import torch

x=torch.tensor([1,3,341,414,14,141,14])

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
x=x[None,x]

IndexError: index 341 is out of bounds for dimension 0 with size 7

In [24]:
y=x.unsqueeze(0)

In [26]:
y

tensor([[  1,   3, 341, 414,  14, 141,  14]])

In [51]:
x=5
x=torch.tensor(x)

In [55]:
y=x.unsqueeze(0)

In [59]:
x.view(1)

tensor([5])

In [45]:
mask=x.new_ones((1, 1, 5))

In [50]:
y=mask[0,:,5:]

In [2]:
torch.cuda.is_available()

True

In [61]:
torch.zeros(1).cuda()

AssertionError: Torch not compiled with CUDA enabled

In [1]:

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, release GPU memory, then print utilisation again.

    PyTorch's cached blocks are released with ``torch.cuda.empty_cache()``;
    afterwards device 0 is selected through numba, closed, and selected again.
    """
    device_index = 0

    print("instial GPU usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # NOTE(review): closing the context through numba may also invalidate CUDA
    # state that PyTorch still relies on in this process -- confirm before
    # calling this in the middle of a training run.
    cuda.select_device(device_index)
    cuda.close()
    cuda.select_device(device_index)

    print("GPU usage after emptying the cache")
    gpu_usage()

    

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import json

# Load the [wav_path, #samples] lists of both sources. The context managers
# close the files once parsed; the bare open() calls left both handles open.
with open("dataset/json/tr/s1.json") as f:
    data1 = json.load(f)
with open("dataset/json/tr/s2.json") as b:
    data2 = json.load(b)

In [14]:
# Count the utterances whose recorded sample count differs between s1 and s2.
# Iterating over the actual pairs replaces the hard-coded range(40000), which
# raised IndexError on shorter lists and ignored entries beyond 40000.
count = 0
max_diff = 0  # initialised here as before; nothing in this cell updates it
for info1, info2 in zip(data1, data2):
    if info1[1] != info2[1]:
        print(info1)
        print(info1[1])
        print(info2[1])
        count += 1

print(count)

['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tt\\s1\\100002.wav', 105720]
105720
105721
['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tt\\s1\\100319.wav', 116521]
116521
93200
['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tt\\s1\\100651.wav', 126920]
126920
126921
['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tt\\s1\\100804.wav', 125881]
125881
87680
['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tt\\s1\\101008.wav', 117561]
117561
116880
['A:\\major\\Attention-Is-All-You-Need-In-Speech-Separation\\Attention-Is-All-You-Need-In-Speech-Separation\\dataset\\min\\tt\\s1\\103269.wav', 122480]
122480
122481
['A:\\

In [2]:
import torch

x=5
x=torch.tensor(x)
x.cuda()

tensor(5, device='cuda:0')

torch.Tensor

In [19]:
torch.cuda.max_memory_allocated()

0

In [18]:
torch.cuda.current_device()

0