In [1]:
import warnings
warnings.filterwarnings("ignore")

# import librosa
import torchaudio as ta
ta.set_audio_backend("sox_io")
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd.profiler as profiler
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler, BatchSampler, RandomSampler
from pathlib import Path

import numpy as np
import os

import IPython.display as ipd

import numpy as np

import math

import glob

from tqdm.auto import tqdm

# from python_files.Noise_Reduction_Datagen_paths import Signal_Synthesis_DataGen
# from python_files.unet_basic import Model

import warnings

# warnings.filterwarnings("ignore")
import warnings

import gc

from random import shuffle
import random
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict, Union, Optional, Callable, Any

from datasets import load_dataset, get_dataset_split_names
import huggingface_hub

# from numba import jit

In [2]:
# huggingface_hub.notebook_login()
HF_DATASETS_CACHE_DIR = "/mnt/nvme0n1p1/Cache/huggingface/datasets"
os.makedirs(HF_DATASETS_CACHE_DIR, exist_ok=True)

In [3]:
#takes the Dataframe of Clean SPeech and Noise Speech and returns noise added speech along with Random SNR noise added, Also the clean speech signal is returned
class SignalSynthesisDataset(Dataset):
    def __init__(self, clean_df: pd.DataFrame, noise_df: pd.DataFrame, \
                noise_path: Union[str, Path], clean_path: Union[str, Path], \
                sample_time: int, sr: int=16000, noise_snr_range: List[int]=[-5, 15], \
                noise_snr_prob: float=0.5):
        
        self.clean_df = clean_df
        self.noise_df = noise_df
        self.noise_path = noise_path
        self.clean_path = clean_path
        self.sample_time = sample_time
        self.sr = sr
        self.noise_snr_range = noise_snr_range
        self.noise_snr_prob = noise_snr_prob

    def get_ids(self, idx):
        signal_id = idx//len(self.noise_df)
        noise_id = idx - signal_id * len(self.noise_df)

        return signal_id, noise_id
    
    def get_signal(self, signal_id, df, path) -> Tuple[List[torch.Tensor], torch.Tensor]:
        signal_name = df.iloc[signal_id]["path"]
        signal_path = path / signal_name
        signal, sr = ta.load(signal_path)
        if sr != self.sr:
            warnings.warn("Resampling the signal to 16KHz")
            signal = ta.transforms.Resample(sr, self.sr)(signal)
        signal = signal[0]
        return signal, sr
    
    def adjust_clean_signal_length(self, signal: torch.Tensor) -> torch.Tensor:

        final_len = int(self.sr * self.sample_time)
        if len(signal) > final_len:
            signal = signal[:final_len]
        else:
            
            add_len = final_len - len(signal)
            zeros_signal = torch.zeros(add_len)
            signal = signal.numpy()
            signal = np.append(signal, (zeros_signal))
            signal = torch.from_numpy(signal)

        return signal
    
    def adjust_noise_signal_length(self, signal: torch.Tensor) -> torch.Tensor:

        final_len = int(self.sr * self.sample_time)
        if len(signal) > final_len:
            signal = signal[:final_len]
        else:
            signal_buffer = np.zeros(final_len)
            signa = signal.numpy()
            for i in range(final_len//len(signal)):
                signal_buffer[i*signal : (i+1)*signal] = signal
            signal_buffer[(i+1)*signal:] = signal[:final_len - (i+1)*signal]
            signal = torch.from_numpy(signal_buffer)

        return signal

    def get_mixed_signal(self, clean_signal: torch.Tensor, noise_signal: torch.Tensor, snr):
        clean_signal_power = clean_signal.norm(2)
        noise_signal_power = noise_signal.norm(2)

        scale = snr * clean_signal_power / noise_signal_power
        mixed_signal = (scale * clean_signal + noise_signal) / 2
        return mixed_signal
    
    def signals_alchemy(self, signal_id, noise_id, snr):
        
        clean_signal, clean_sr = self.get_signal(signal_id, self.clean_df, self.clean_path)
        noise_signal, noise_sr = self.get_signal(noise_id, self.noise_df, self.noise_path)
        clean_signal = self.adjust_clean_signal_length(clean_signal)
        noise_signal = self.adjust_noise_signal_length(noise_signal)

        noise_signal = self.get_mixed_signal(clean_signal, noise_signal, snr)



        return clean_signal, noise_signal


    def __len__(self):
        return len(self.df) * len(self.noise_df)
    
    def __getitem__(self, idx):

        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        signal_id, noise_id = self.get_ids(idx)
        snr = random.uniform(self.noise_snr_range[0], self.noise_snr_range[1])
        clean_signal, noisy_signal = self.signals_alchemy(signal_id, noise_id, snr)

        return clean_signal, noisy_signal





In [13]:
CLEAN_SAMPLES_PATH = Path("/mnt/nvme1n1p2/vamsik1211/Data/git-repos/Noise-Reduction-Deep-Learning/dataset/dataset/cv-corpus-5.1-2020-06-22-Resampled/en/clips")
NOISE_SAMPLES_PATH = Path("/mnt/nvme1n1p2/vamsik1211/Data/git-repos/Noise-Reduction-Deep-Learning/dataset/dataset/UrbanSound8K-Resampled/all_files")

train_df = pd.read_csv("dataset/dataset/cv-corpus-5.1-2020-06-22-Resampled/en/train.tsv", sep="\t")
train_df = train_df[train_df["up_votes"] > 1]
train_df = train_df[train_df["segment"] != 'Singleword Benchmark']
train_df = train_df[train_df["down_votes"] <= 1]


test_df = pd.read_csv("dataset/dataset/cv-corpus-5.1-2020-06-22-Resampled/en/test.tsv", sep="\t")
test_df = test_df[test_df["up_votes"] > 1]
test_df = test_df[test_df["segment"] != 'Singleword Benchmark']
test_df = test_df[test_df["down_votes"] < 3]


sample_time = 5 # secs
sample_rate = 16000
noise_snr_range = [-5, 15]
noise_snr_prob = 0.5

print(train_df.shape)


signal_synthesis_dataset = SignalSynthesisDataset(train_df, train_df, NOISE_SAMPLES_PATH, CLEAN_SAMPLES_PATH, sample_time, sample_rate, noise_snr_range, noise_snr_prob)

(434983, 10)


In [None]:
signal_synthesis_dataset.__getitem__(1)

In [9]:
!export HF_HOME="/home/vamsik1211/Data/Cache/huggingface"

In [4]:
dataset = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="train", streaming=False, cache_dir=HF_DATASETS_CACHE_DIR)

Found cached dataset common_voice_13_0 (/mnt/nvme0n1p1/Cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/en/13.0.0/22809012aac1fc9803eaffc44122e4149043748e93933935d5ea19898587e4d7)


In [5]:
r_sampler = RandomSampler(dataset, replacement=False)

In [6]:
dataset[183749]["audio"]["array"].max()

0.1615888923406601

In [7]:
mean = 0.0
std = 0.0

# dataloader = DataLoader(dataset, batch_size=32, num_workers=12)
# batch_sampler = BatchSampler(RandomSampler(dataset), batch_size=32, drop_last=False)
# dataloader = DataLoader(dataset, batch_sampler=batch_sampler)


for signals in tqdm(dataloader):
    signal = signals["audio"]["array"]
    # mean += signal.mean().sum()
    # std += signal.std().sum()

NameError: name 'dataloader' is not defined

In [7]:
cv_13?

[0;31mType:[0m        Dataset
[0;31mString form:[0m
Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 4479
})
[0;31mLength:[0m      4479
[0;31mFile:[0m        ~/Data/miniconda3/envs/kaggle-env/lib/python3.8/site-packages/datasets/arrow_dataset.py
[0;31mDocstring:[0m   A Dataset backed by an Arrow table.


In [26]:
r_sampler

<torch.utils.data.sampler.RandomSampler at 0x7f2aeeb57bb0>

In [16]:
DataLoader?

[0;31mInit signature:[0m [0mDataLoader[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Data loader. Combines a dataset and a sampler, and provides an iterable over
the given dataset.

The :class:`~torch.utils.data.DataLoader` supports both map-style and
iterable-style datasets with single- or multi-process loading, customizing
loading order and optional automatic batching (collation) and memory pinning.

See :py:mod:`torch.utils.data` documentation page for more details.

Args:
    dataset (Dataset): dataset from which to load the data.
    batch_size (int, optional): how many samples per batch to load
        (default: ``1``).
    shuffle (bool, optional): set to ``True`` to have the data reshuffled
        at every epoch (default: ``False``).
    sampler (Sampler or Iterable, optional): defines the strategy to draw
        samples from the dataset. Can be any ``Iterable`` with ``__len__`

In [12]:
get_dataset_split_names("mozilla-foundation/common_voice_13_0", "en")

Downloading builder script:   0%|          | 0.00/8.35k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

['train', 'validation', 'test', 'other', 'invalidated']

In [7]:
import huggingface_hub

In [8]:
huggingface_hub.login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /home/vamsik1211/.huggingface/token
Login successful
