In [168]:
import numpy as np
import csv
from scipy.io import wavfile as wav
from scipy.fftpack import fft
from random import randint
import librosa
from librosa import display
from librosa import feature
import matplotlib.pyplot as plt
import matplotlib.image as img
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

SR = 16000

FileNames = {"train":"ml-fmi-23-2020//train.txt", "valid":"ml-fmi-23-2020//validation.txt", "test":"ml-fmi-23-2020//test.txt", "ex":"ml-fmi-23-2020//sample_submission.txt","pred":"ml-fmi-23-2020//predictions.txt"}

AudioFolders = {"train":"ml-fmi-23-2020//audio//train//", "valid":"ml-fmi-23-2020//audio//validation//", "test":"ml-fmi-23-2020//audio//test//"}

SpectogramFolders = {"train":"ml-fmi-23-2020//spectogram//train//", "valid":"ml-fmi-23-2020//spectogram//validation//", "test":"ml-fmi-23-2020//spectogram//test//"}

def readCsv (fileName, hasLables):
    data = []
    with open(fileName, "r", newline='\n') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            data.extend(row)
    if hasLables:
        data = np.transpose(np.array(data).reshape((len(data)//2, 2))) 
    else:
        data = np.array(data)
    return data
def writeCsv (data, labels, fileName = FileNames['pred']):
    with open(fileName, "w", newline='\n') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['name', 'label'])
        for row in np.transpose([data, labels]):
            writer.writerow(row)

def audioToSpectograms (folderName, dataNames):
    i = 0
    for dataName in dataNames:
        i += 1
        if i % 100 == 0:
            print (i, '/', len(dataNames))
        audio, sr = librosa.load(AudioFolders[folderName]+dataName)
        audio_db  = librosa.amplitude_to_db(abs(librosa.stft(audio)))
        librosa.display.specshow(audio_db, sr=sr, cmap='gray')
        plt.savefig(SpectogramFolders[folderName]+dataName[:-3]+'png', bbox_inches='tight', pad_inches=0)
    print ('Done with', folderName)

def toBins(data, nrBins=8000, mi=0, ma=8000):
    bins = np.linspace(start = mi, stop = ma, num = nrBins)
    return np.digitize([d.reshape(-1) for d in data], bins) 
    # data_processed = (data - mi) / (ma - mi) # data to interval [0,1]
    # return np.multiply(data_processed, nrBins) // 1 # data to interval [0, bins]

def getData (folderName, dataNames):
    data = []
    for dataName in dataNames:
        audio = librosa.load(AudioFolders[folderName]+dataName)[0]
        data.append(librosa.feature.melspectrogram(y=audio, sr=SR, n_mels=128, fmax=8000))
        # data.append(librosa.amplitude_to_db(abs(librosa.stft(audio))).reshape(-1))
        # data.append(np.abs(fft(wav.read(AudioFolders[folderName]+dataName)[1])))
    return np.array(data)

In [106]:
# the len of each audio file is 16000 
train_data_name, train_labels = readCsv(FileNames['train'], hasLables=True)
valid_data_name, valid_labels = readCsv(FileNames['valid'], hasLables=True)
test_data_name                = readCsv(FileNames['test'],  hasLables=False)
# writeCsv(train_data, train_labels)

In [153]:
# train_data = getData('train', train_data_name)
# valid_data = getData('valid', valid_data_name)
# test_data  = getData('test',  test_data_name)

In [169]:
train = toBins(train_data)
valid = toBins(valid_data)
test  = toBins(test_data)

In [170]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(5)
knn.fit(train, train_labels)
# knn.score(valid_data, valid_labels)
valid_pred_knn = knn.predict(valid)
good_knn = np.argwhere(valid_pred_knn==valid_labels).reshape(-1)
print (len(good_knn)/len(valid_pred_knn))

0.515


In [None]:
good = np.argwhere(valid_pred==valid_labels).reshape(-1)
print (len(good)/len(valid_pred))

In [None]:
confusion_matrix(valid_pred, valid_labels)

In [None]:
def spectrogram (dataLen = 10, t=None):
    if t not in ['0', '1', None]:
        raise Exception ("UnknownType")
    for i in range (dataLen):
        curr = randint(0, len(train_data_name)-1)
        if t is not None:
            while train_labels[curr] != t:
                curr = randint(0, len(train_data_name)-1)
        
        x , sr = librosa.load(AudioFolders['train']+train_data_name[curr])

        # plt.figure(figsize=(14, 5))
        # librosa.display.waveplot(x, sr=sr)

        Xdb = librosa.amplitude_to_db(abs(librosa.stft(x)))
        # plt.figure(figsize=(14, 5))
        librosa.display.specshow(Xdb, sr=sr, cmap='gray', x_axis='time', y_axis='log')
        # plt.savefig('test.png', bbox_inches='tight', pad_inches=0)
        plt.colorbar()
spectrogram(1)

In [None]:
import librosa
import numpy
import skimage

def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def spectrogram_image(y, sr, out, hop_length, n_mels):
    # use log-melspectrogram
    mels = librosa.feature.melspectrogram(y=y, sr=sr)
    mels = numpy.log(mels + 1e-9) # add small number to avoid log(0)

    # min-max scale to fit inside 8-bit range
    img = scale_minmax(mels, 0, 255).astype(numpy.uint8)
    img = numpy.flip(img, axis=0) # put low frequencies at the bottom in image
    img = 255-img # invert. make black==more energy

    # save as PNG
    plt.imshow(img)


if __name__ == '__main__':
    # settings
    hop_length = 512 # number of samples per time-step in spectrogram
    n_mels = 128 # number of bins in spectrogram. Height of image
    time_steps = 384 # number of time-steps. Width of image

    # load audio. Using example from librosa
    path = librosa.util.example_audio_file()
    y, sr = librosa.load(AudioFolders['train']+train_data_name[0]) 
    #y, sr = librosa.load(path, offset=1.0, duration=10.0, sr=22050)
    out = 'out.png'

    # extract a fixed length window
    start_sample = 0 # starting at beginning
    length_samples = time_steps*hop_length
    window = y#[start_sample:start_sample+length_samples]

    # convert to PNG
    spectrogram_image(window, sr=sr, out=out, hop_length=hop_length, n_mels=n_mels)
    print('wrote file', out)

In [108]:
# audio = librosa.load(AudioFolders['train']+train_data_name[0])[0]
# spect = librosa.amplitude_to_db(abs(librosa.stft(audio))).reshape(-1)

for i in range(10):
    
    y, sr = librosa.load(AudioFolders['train']+train_data_name[i]) 
    # C:\Users\costi\Anaconda3\Lib\site-packages\librosa\feature\spectral.py 1813
    S = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=128, fmax=8000)
    # mels = numpy.log(mels + 1e-9) # add small number to avoid log(0)
    print (S.min(), S.max())
    S_scaled = toBins(S)
    # S_dB = librosa.power_to_db(S, ref=np.max)
    # print (S_dB.min(), S_dB.max())
    # s_db_scaled = toBins(S_dB)

    print (sum(sum(S_scaled)))
    # print (sum(sum(s_db_scaled)))

# plt.figure(figsize=(15, 4))
# librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=sr, fmax=8000)
# plt.colorbar(format='%+2.0f dB')
# plt.title('Mel-frequency spectrogram')
# plt.tight_layout()
# plt.show()

2.410839e-17 975.5328
745.0
5.1736915e-16 910.9207
206.0
8.45909e-15 1007.74854
565.0
4.9533473e-18 1342.1692
181.0
6.80572e-16 786.81805
340.0
1.7421274e-17 1388.3912
373.0
1.6051241e-17 893.05054
198.0
8.7040725e-18 1533.666
262.0
7.971942e-18 884.98065
58.0
8.3372744e-16 4879.922
577.0
