In [10]:
import numpy as np
import copy
import random
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import pandas
import h5py
import librosa.display
import IPython.display as ipd
import skimage.io
from PIL import Image
import pickle
import scipy.io.wavfile as wav
import scipy.signal as signal


# Dataset locations (Windows-style, relative to the notebook's working directory).
# The literals are written as a raw string / with escaped backslashes so they no
# longer depend on the invalid escape sequences "\d" and "\G" being passed
# through unchanged (a SyntaxWarning on recent Python versions). The resulting
# string values are exactly the same as before.
RAWDATA_PATH = r'..\data\Glosy_zbior'
# A raw string cannot end in a backslash, hence the escaped form here.
SAVE_PATH = '..\\data\\'

# Speaker names. Capitalisation is kept exactly as listed; the cells that use
# this list lower-case both sides before comparing against file names.
peoples = [
    'Adrian', 'Bartek', 'Damian', 'Ewelina', 'Hubert', 'jakub',
    'Kamil', 'Kasia', 'Kuba', 'Lukasz', 'Mariusz', 'Mikolaj',
    'oskar', 'patryk', 'Pawel', 'przemek', 'Rafal', 'szymon',
]

# Class / command vocabulary.
# NOTE(review): `words` is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed.
words = [
    'background', 'close', 'door', 'down', 'go', 'home', 'left',
    'light', 'no', 'off', 'on', 'open', 'right', 'shutdown',
    'silence', 'speech', 'stop', 'unknown', 'up', 'windows', 'yes',
]

def save_to_pickle(path, group_name, samples):
    """Serialize ``samples`` into the file ``<path>/<group_name>.pickle``.

    Args:
        path: Directory in which the pickle file is written. ``open`` is
            called directly on the target, so the directory must already exist.
        group_name: Name of the output file, without the ``.pickle`` extension.
        samples: Any picklable object; it is dumped as a single pickle record.

    Returns:
        None. An existing file with the same name is overwritten.
    """
    # os.path.join inserts the platform's separator (and does not add a second
    # one when ``path`` already ends with a separator), instead of always
    # gluing the parts together with a hard-coded backslash.
    filename = os.path.join(path, group_name + '.pickle')
    with open(filename, 'wb') as f:
        pickle.dump(samples, f)

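Labels = commands: in the cells below, each sample is stored together with its spoken command (and the speaker's name).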

In [11]:
def mel_from_file(sample_rate, audio_data, pre_emphasis=0.97, frame_length=0.025,
                  frame_shift=0.01, fft_size=512):
    """Compute a log-power spectrogram of an already-loaded audio signal.

    Despite its name, this function neither reads a file nor applies a mel
    filterbank: it takes the ``(sample_rate, audio_data)`` pair as returned by
    a WAV reader and produces a linear-frequency spectrogram through
    pre-emphasis -> framing -> Hamming window -> rFFT magnitude -> power ->
    ``10 * log10(1 + power)``.

    Args:
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        audio_data: 1-D array of samples. An array with more than one
            dimension is treated as ``(samples, channels)`` and averaged down
            to mono (previously it was silently flattened).
        pre_emphasis: Coefficient of the first-order pre-emphasis filter.
        frame_length: Length of each analysis frame in seconds.
        frame_shift: Hop between consecutive frames in seconds.
        fft_size: FFT length. ``np.fft.rfft`` crops frames that are longer
            than ``fft_size``, which happens with the defaults for sample
            rates above 20480 Hz.

    Returns:
        Float array of shape ``(num_frames, fft_size // 2 + 1)``.

    Raises:
        ValueError: If ``audio_data`` is empty, or if the frame length / frame
            shift amount to less than one sample at ``sample_rate``.
    """
    audio = np.asarray(audio_data, dtype=np.float64)
    if audio.ndim > 1:
        # Down-mix multi-channel audio instead of interleaving the channels.
        audio = audio.reshape(audio.shape[0], -1).mean(axis=1)
    if audio.size == 0:
        raise ValueError('audio_data is empty')

    samples_per_frame = int(frame_length * sample_rate)
    samples_per_shift = int(frame_shift * sample_rate)
    if samples_per_frame < 1 or samples_per_shift < 1:
        raise ValueError('sample_rate is too low for the requested frame length/shift')

    # First sample is kept as is; every other sample has a scaled copy of its
    # predecessor subtracted.
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])

    # Signals shorter than one frame used to fail with a shape error; pad them
    # with zeros so that exactly one frame is produced.
    if len(emphasized_audio) < samples_per_frame:
        emphasized_audio = np.pad(
            emphasized_audio, (0, samples_per_frame - len(emphasized_audio)), mode='constant'
        )

    # `scipy.signal.hamming` was removed from the top-level namespace in recent
    # SciPy releases; the window functions live in `scipy.signal.windows`.
    window = signal.windows.hamming(samples_per_frame)

    # Only complete frames are used; trailing samples that do not fill a frame
    # are dropped.
    num_frames = 1 + (len(emphasized_audio) - samples_per_frame) // samples_per_shift

    # Row i of `frame_index` holds the sample indices of frame i.
    frame_starts = samples_per_shift * np.arange(num_frames)
    frame_index = frame_starts[:, None] + np.arange(samples_per_frame)[None, :]
    frames = emphasized_audio[frame_index] * window

    spectra = np.abs(np.fft.rfft(frames, n=fft_size))
    power_spectra = spectra ** 2
    # The "+ 1" keeps the logarithm finite (and the output non-negative) for
    # silent frames.
    power_spectra_db = 10 * np.log10(1 + power_spectra)

    return power_spectra_db


In [12]:
# nonaugmented
# Build one (spectrogram, command, speaker) tuple per file found under
# RAWDATA_PATH and pickle the whole list as 'commands_no_aug'.
samples = []
for root_dir, cur_dir, files in os.walk(RAWDATA_PATH):
    for file in tqdm(files):
        # The first two '_'-separated tokens of the file name are taken as the
        # speaker name and the command, in that order.
        name, command = file.split('_')[:2]
        sample = mel_from_file(*wav.read(os.path.join(root_dir, file)))
        samples.append((sample, command, name))

save_to_pickle(path=SAVE_PATH, group_name='commands_no_aug', samples=samples)

  sample = mel_from_file(*wav.read(root_dir + '\\' + file))
100%|██████████| 1134/1134 [00:04<00:00, 274.71it/s]


In [15]:
# augmented
# For every recording, store the clean spectrogram plus N_NOISY_COPIES randomly
# perturbed versions, each as a (spectrogram, command, speaker) tuple.
augmented = []

NOISE_RANGE = (0, 2)   # bounds of the random per-sample gain
N_NOISY_COPIES = 10    # perturbed copies generated per recording
AUGMENT_SEED = 0       # fixed seed so that a re-run reproduces the same dataset
rng = np.random.default_rng(AUGMENT_SEED)

for root_dir, cur_dir, files in os.walk(RAWDATA_PATH):
    for file in tqdm(files):
        # The first two '_'-separated tokens of the file name are taken as the
        # speaker name and the command, in that order.
        name, command = file.split('_')[:2]
        sample_rate, audio_data = wav.read(os.path.join(root_dir, file))
        sample = mel_from_file(sample_rate, audio_data)
        augmented.append((sample, command, name))
        for _ in range(N_NOISY_COPIES):
            # Multiplicative perturbation: every audio sample is scaled by its
            # own factor drawn uniformly from NOISE_RANGE.
            noised_sample = audio_data * rng.uniform(*NOISE_RANGE, size=audio_data.shape)
            sample = mel_from_file(sample_rate, noised_sample)
            augmented.append((sample, command, name))

# Save the augmented list. The previous version passed `samples` here, i.e. it
# wrote the non-augmented data (or failed on a fresh kernel).
save_to_pickle(path=SAVE_PATH, group_name='commands_aug', samples=augmented)

  sample_rate, audio_data = wav.read(root_dir + '\\' + file)
100%|██████████| 1134/1134 [02:36<00:00,  7.22it/s]


Labels = persons: in the cells below, each spectrogram is labelled with the name of the speaker.

In [4]:
# nonaugmented
# One dB-scaled STFT magnitude spectrogram per recording, labelled with the
# speaker whose name occurs in the file name.
spectros = []
labels = []

for root_dir, cur_dir, files in os.walk(RAWDATA_PATH):
    for file in tqdm(files):
        file_lower = file.lower()
        # First entry of `peoples` whose "<name>_" occurs (case-insensitively)
        # in the file name; files that match nobody are skipped.
        person = next((p for p in peoples if p.lower() + '_' in file_lower), None)
        if person is None:
            continue
        y, sr = librosa.load(os.path.join(root_dir, file))
        sample = librosa.stft(y)
        sample = librosa.amplitude_to_db(np.abs(sample), ref=np.max)
        # Append to both lists only after the file was processed, so they
        # always stay aligned.
        spectros.append(sample)
        labels.append(person)

# NOTE(review): `save_to_h5` is not defined in any cell visible in this
# notebook; on a fresh kernel this call raises NameError unless the helper is
# provided elsewhere.
save_to_h5(path=SAVE_PATH, group_name='peoples_no_aug', spectros=spectros, labels=labels)

100%|██████████| 1134/1134 [00:03<00:00, 323.81it/s]


In [5]:
# augmented
# For every recording of a known speaker, generate one spectrogram per
# (pitch shift, time stretch) combination: 3 x 3 = 9 variants per file.
spectros = []
labels = []

NOISE_RANGE = (0, 2)   # bounds of the random per-sample gain
pitch_shift_ranges = np.linspace(-2.5, 2.5, num=3)
time_shift_ranges = np.linspace(0.8, 1.2, num=3)
AUGMENT_SEED = 0       # fixed seed so that a re-run reproduces the same dataset
rng = np.random.default_rng(AUGMENT_SEED)

for root_dir, cur_dir, files in os.walk(RAWDATA_PATH):
    for file in tqdm(files):
        file_lower = file.lower()
        # First entry of `peoples` whose "<name>_" occurs (case-insensitively)
        # in the file name; files that match nobody are skipped.
        person = next((p for p in peoples if p.lower() + '_' in file_lower), None)
        if person is None:
            continue
        y, sr = librosa.load(os.path.join(root_dir, file))
        for pitch_shift in pitch_shift_ranges:
            # The pitch-shifted signal does not depend on `time_shift`, so it
            # is computed once per pitch step instead of once per combination.
            pitch_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)
            for time_shift in time_shift_ranges:
                sample = librosa.effects.time_stretch(pitch_shifted, rate=time_shift)
                # Multiplicative perturbation: every audio sample is scaled by
                # its own factor drawn uniformly from NOISE_RANGE.
                sample = sample * rng.uniform(*NOISE_RANGE, size=sample.shape)
                sample = librosa.stft(sample)
                sample = librosa.amplitude_to_db(np.abs(sample), ref=np.max)
                spectros.append(sample)
                labels.append(person)

# NOTE(review): `save_to_h5` is not defined in any cell visible in this
# notebook; on a fresh kernel this call raises NameError unless the helper is
# provided elsewhere.
save_to_h5(path=SAVE_PATH, group_name='peoples_aug', spectros=spectros, labels=labels)

100%|██████████| 1134/1134 [03:52<00:00,  4.87it/s]


In [None]:
# Number of spectrograms currently held in `spectros`.
len(spectros)

In [48]:
# Spot-check of a stored label: print entry '10' under 'peoples_aug_labels'.
# The path is a raw string so that "\i", "\d" and "\s" are no longer parsed as
# (invalid) escape sequences; the resulting path is unchanged.
# NOTE(review): hard-coded absolute path -- it does not follow SAVE_PATH and
# only works on a machine that has this exact file.
with h5py.File(r'E:\iot\data\spectrograms.h5', 'r') as hdf:
    print(hdf['peoples_aug_labels']['10'][()].decode('utf-8'))

background


In [None]:
# from collections import Counter
# counter_object = Counter(labels_list)
# keys = counter_object.keys()
# print(keys, counter_object.values())
# len(labels_list)