In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import soundfile as sf
import librosa.display
from glob import glob
import os
from tqdm import tqdm

In [2]:
def splitData(X, t, testFraction=0.2, randomize = False):
    """
    Split the data randomly into training and test sets
    Use numpy functions only
    Inputs:
        X: (np array of len Nsamples) input feature vectors
        t: (np array of len Nsamples) targets; one hot vectors
        testFraction: (float) Nsamples_test = testFraction * Nsamples
    Outputs:
        X_train: training set
        X_test: test set
        t_train: training labels
        t_test: test labels
    """
    if randomize is False:
      tot_samples = np.random.RandomState(seed=42).permutation(len(X))
    else:
      tot_samples = np.random.permutation(len(X))

    X_train = []
    X_test = []
    t_train = []
    t_test = []
    
    test_samples = max(1,int(len(X)*testFraction))
    # print(tot_samples[:test_samples])
    # print(tot_samples[test_samples:])

    for i in range(test_samples):
      X_test.append(X[tot_samples[i]])
      t_test.append(t[tot_samples[i]])

    for i in range(test_samples,len(X)):
      X_train.append(X[tot_samples[i]])
      t_train.append(t[tot_samples[i]])
    
    return np.asarray(X_train), np.asarray(t_train), np.asarray(X_test), np.asarray(t_test)

def calc_spec(x):
    n_fft = 1024
    hop_length = 512
    win_length = 1024
    X = np.abs(librosa.stft(x, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann'))
    X = librosa.power_to_db(X**2,ref=np.max)
    return X

def audio2spec(x, norm=True, mfcc=True):
    
    '''
    Compute Mel-frequency cepstral coefficients (MFCCs)
    Inputs:
        x: np array of shape (Nsamples,)
    Output:
        X: (np array) spectrogram sequence
    '''
    X=[]
    for sample in x:
      if mfcc is False:
        X.append(calc_spec(sample))
      else:
        X.append(librosa.feature.mfcc(S=calc_spec(sample), n_mfcc=20))  
    if norm is True:
      X = (X-np.mean(X))/np.std(X)
    return np.asarray(X)

In [3]:
# PARAMETER FOR STFT
SAMPLING_RATE=16000
DURATION=10
N_FFT = 1024
WIN_LENGTH = 1024
HOP_LENGTH = 512

# LOADING THE TRAINING SPECTROGRAMS
X = np.load('/content/drive/MyDrive/Sem 5/EE603/project/val_set/Final_Dataset/training_spectrogram.npy')
print(X.shape)
y = np.load('/content/drive/MyDrive/Sem 5/EE603/project/val_set/Final_Dataset/training_labels.npy')
print(y.shape)
# from sklearn.utils import shuffle
# X, y = shuffle(X, y)

# EXTRACTING THE MFCC FEATURES FROM THE SPECTROGRAM
def get_mfcc(X):
  X_new = []
  for i in range(len(X)):
    mel_spectrogram = librosa.feature.melspectrogram(S=X[i], n_fft=N_FFT, hop_length=HOP_LENGTH)
    mfcc = librosa.feature.mfcc(S=mel_spectrogram, sr=16000,n_mfcc=20)
    mean = np.average(mfcc)
    std = np.std(mfcc)
    if(std == 0):
      std = std + 1e-25
    mfcc = (mfcc - mean)/std
    X_new.append(mfcc)
  return np.array(X_new)
X = get_mfcc(X)

(717, 513, 313)
(717,)


In [4]:
# FUNCTIONS TO CONVERT STRING TO ONE-HOT-VECTORS
def one_hot_enc(cl_name):
  if(cl_name == 'music'):
    return 2
  elif(cl_name == 'speech'):
    return 1
  else:
    return 0

def string2ohv(y):
  y_ohv = []
  for i in y:
    y_ohv.append(one_hot_enc(i))
  return np.array(y_ohv)

In [5]:
X_train, y_train, X_test, y_test = splitData(X=X, t=y, testFraction=0.2, randomize=True)
print(X_train.shape)
print(X_test.shape)

(574, 20, 313)
(143, 20, 313)


In [6]:
# RESHAPING THE DATA
def reshape(X, y):
  # Reshape the data to make each entry in the spectrogram a training sample
  X_new = []
  y_new = []
  for i in range(len(X)):
    X_new.append(np.transpose(X[i]))
    # print(X[i].shape[1])
    y_new.append([y[i]]*X[i].shape[1])
    # print(np.shape(y_new))
  X_new = np.array(X_new)
  y_new = np.array(y_new)
  X_new = X_new.reshape((-1, X[0].shape[0]))
  y_new = y_new.reshape((-1, 1))
  print(X_new.shape)
  print(y_new.shape)
  return X_new, y_new
X_train, y_train = reshape(X_train, y_train)
X_test, y_test = reshape(X_test, y_test)
y_test = string2ohv(y_test)
y_train = string2ohv(y_train)
print(y_train.shape)
print(y_test.shape)

(179662, 20)
(179662, 1)
(44759, 20)
(44759, 1)
(179662,)
(44759,)


In [7]:
# PREPARING THE BAG OF FRAMES TO ESTABLISH TEMPORAL DEPENDENCE
def make_bags(X_prev):
  X_new = []
  X_new.append(X_prev[:7])
  X_new.append(X_prev[:7])
  X_new.append(X_prev[:7])

  for i in tqdm(range(len(X_prev)-6)):
    A = X_prev[i:i+7]
    X_new.append(A)

  X_new.append(X_prev[-7:])
  X_new.append(X_prev[-7:])
  X_new.append(X_prev[-7:])
  
  X_new = np.array(X_new)
  return X_new
X_test_reshaped = make_bags(X_test)
print(X_test_reshaped.shape)
X_train_reshaped = make_bags(X_train)
print(X_train_reshaped.shape)

100%|██████████| 44753/44753 [00:00<00:00, 1080493.70it/s]


(44759, 7, 20)


100%|██████████| 179656/179656 [00:00<00:00, 1224844.82it/s]


(179662, 7, 20)


In [8]:
X_train = np.reshape(X_train_reshaped, (np.shape(X_train_reshaped)[0],-1))
X_test = np.reshape(X_test_reshaped, (np.shape(X_test_reshaped)[0],-1))

print(np.shape(X_train), np.shape(X_test))

(179662, 140) (44759, 140)


In [9]:
# for i, label in tqdm(enumerate(y_train)):
#   if label[1] == 1:
#     y_train[i] = 2
#   elif label[0] == 1:
#     y_train[i] = 1
#   else:
#     y_train[i] = 0

# for i, label in tqdm(enumerate(y_test)):
#   if label[1] == 1:
#     y_test[i] = 2
#   elif label[0] == 1:
#     y_test[i] = 1
#   else:
#     y_test[i] = 0

In [10]:
# we'll use first 100 data items of every label, each as the templates
N = 100

templates = np.zeros((3, N, X_train[0].shape[0]))
counts = [0,0,0]

for i in tqdm(range(len(y_train))):
  if counts == [N, N, N]:
    break
  if counts[y_train[i]] < 100:
    templates[y_train[i], counts[y_train[i]]] = X_train[i]
    counts[y_train[i]] += 1

print(np.shape(templates))

  2%|▏         | 2917/179662 [00:00<00:00, 496461.00it/s]

(3, 100, 140)





In [12]:
y_train_pred = np.zeros(len(X_train[:10**4]))
y_test_pred = np.zeros(len(X_test[:10**4]))

for i in tqdm(range(len(X_test[:10**4]))):
  scores = [0,0,0]
  for j in range(3):
    for k in range(N):
      scores[j] += np.linalg.norm(X_test[i]-templates[j][k])
  y_test_pred[i] = np.argmin(scores)


for i in tqdm(range(len(X_train[:10**4]))):
  scores = [0,0,0]
  for j in range(3):
    for k in range(N):
      scores[j] += np.linalg.norm(X_train[i]-templates[j][k])
  y_train_pred[i] = np.argmin(scores)

100%|██████████| 10000/10000 [00:32<00:00, 311.33it/s]
100%|██████████| 10000/10000 [00:31<00:00, 314.76it/s]


In [13]:
train_accuracy = np.mean(y_train_pred[:10**4].ravel() == np.asarray(y_train[:10**4]).ravel()) * 100
print('Train accuracy:', train_accuracy)
test_accuracy = np.mean(y_test_pred[:10**4].ravel() == np.asarray(y_test[:10**4]).ravel()) * 100
print('Test accuracy:', test_accuracy)

Train accuracy: 64.21
Test accuracy: 70.36


In [19]:
import sklearn
print(sklearn.metrics.f1_score(y_test_pred[:10**4], y_test[:10**4], average='macro'))

0.6552829036543976


In [17]:
from collections import Counter
print(Counter(y_train_pred[:10**4]), Counter(y_train[:10**4]))
print(Counter(y_test_pred[:10**4]), Counter(y_test[:10**4]))

Counter({0.0: 4706, 2.0: 4184, 1.0: 1110}) Counter({2: 3756, 1: 3740, 0: 2504})
Counter({0.0: 5782, 2.0: 3162, 1.0: 1056}) Counter({0: 4069, 1: 3443, 2: 2488})
