In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import IPython.display as ipd
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
# Release any existing Drive mount first, then mount Google Drive at
# /content/drive; force_remount=True requests a fresh mount even if that
# mount point is already in use. The order of these two calls matters.
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)

In [None]:
# Audio / STFT configuration shared by the notebook.
SAMPLING_RATE = 16000          # passed to librosa as `sr` in get_features
DURATION = 10                  # presumably clip length in seconds -- TODO confirm
N_FFT = 1024                   # FFT size
WIN_LENGTH = N_FFT             # analysis window spans one full FFT frame
HOP_LENGTH = WIN_LENGTH // 2   # 512: consecutive frames overlap by half a window

In [None]:
# Load the pre-computed feature array (X; spectrograms, going by the file name)
# and the matching class labels (y). The directory is named once so that
# pointing the notebook at another dataset is a single-line change.
PROCESSED_DATA_DIR = '/content/drive/MyDrive/val_set/processed_data'
X = np.load(f'{PROCESSED_DATA_DIR}/combined_spectrogram.npy')
print(X.shape)
y = np.load(f'{PROCESSED_DATA_DIR}/y.npy')
print(y.shape)
# Fail fast here on a features/labels mismatch rather than later in the split.
assert len(X) == len(y), f'X has {len(X)} samples but y has {len(y)} labels'

In [None]:
def one_hot_enc(cl_name):
  """Encode a class name as a length-3 one-hot NumPy vector.

  Position 0 marks 'speech', position 1 marks 'music', and position 2 is
  used for every other value of ``cl_name``.
  """
  if cl_name == 'music':
    hot_position = 1
  elif cl_name == 'speech':
    hot_position = 0
  else:
    hot_position = 2
  # Build a fresh array on every call so callers never share storage.
  return np.array([int(slot == hot_position) for slot in range(3)])

In [None]:
def string2ohv(y):
  """Convert an iterable of class-name labels into an array of one-hot rows.

  Every label is encoded with ``one_hot_enc`` and the encodings are stacked
  with ``np.array`` in input order.
  """
  return np.array([one_hot_enc(label) for label in y])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the samples for evaluation. A fixed seed makes the split
# (and therefore every metric computed downstream) reproducible across
# kernel restarts instead of changing on each run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=SPLIT_SEED)
print(X_train.shape)
print(X_test.shape)

(609, 513, 313)
(108, 513, 313)


In [None]:
# Reshape the data: swap the last two axes of every training sample so that
# axis 1 of the original array becomes the trailing axis, then merge the first
# two axes into one long list of rows. The row width is read from the data
# rather than hard-coded, and the transpose is done in one vectorised call
# instead of a per-sample Python loop.
X_train_reshaped = np.transpose(X_train, (0, 2, 1)).reshape((-1, X_train.shape[1]))
print(X_train_reshaped.shape)

(190617, 513)


In [None]:
# Replace the class-name labels with one-hot vectors. The dtype guard keeps
# this cell idempotent: string2ohv returns a numeric array, so on a second
# run the labels are left alone instead of being pushed through the string
# comparison again (which would raise or mislabel every row).
if np.asarray(y_test).dtype.kind in ('U', 'S', 'O'):
  y_test = string2ohv(y_test)
if np.asarray(y_train).dtype.kind in ('U', 'S', 'O'):
  y_train = string2ohv(y_train)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout, TimeDistributed, LSTM


In [None]:
def get_conv_model(shape=None):
  """Build, summarise and compile the convolutional classifier.

  The network is four 3x3 'same'-padded ReLU convolutions (16, 32, 64 and
  128 filters), a 2x2 max-pool, 50% dropout, then dense layers of 128 and
  32 ReLU units and a 3-way softmax output.

  Args:
    shape: per-sample input shape handed to the first Conv2D layer. When
      omitted, the notebook-level ``input_shape`` variable is used, which
      keeps existing ``get_conv_model()`` calls working.

  Returns:
    The compiled Keras ``Sequential`` model (categorical cross-entropy loss,
    Adam optimizer, accuracy metric). ``model.summary()`` is printed as a
    side effect.
  """
  if shape is None:
    # Backward-compatible fallback to the global set by the calling cell.
    shape = input_shape
  conv_options = dict(activation='relu', strides=(1,1), padding='same')

  model = Sequential()
  model.add(Conv2D(16, (3,3), input_shape=shape, **conv_options))
  for n_filters in (32, 64, 128):
    model.add(Conv2D(n_filters, (3,3), **conv_options))
  model.add(MaxPool2D((2,2)))
  model.add(Dropout(0.5))
  model.add(Flatten())
  model.add(Dense(128, activation='relu'))
  model.add(Dense(32, activation='relu'))
  model.add(Dense(3, activation='softmax'))
  model.summary()
  model.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])
  return model

In [None]:
def get_features(X, feat='mfcc', n_mfcc=20, n_fft=1024, hop_length=512):
  """Compute one librosa feature for every sample in ``X`` and stack them.

  Args:
    X: iterable of samples; each one is handed to librosa as the ``y``
      argument. NOTE(review): librosa documents ``y`` as an audio time
      series, while this notebook's ``X`` is loaded from a spectrogram
      file -- confirm what callers are expected to pass.
    feat: which feature to extract: 'mfcc', 'zcr' (zero-crossing rate) or
      'rms' (RMS energy).
    n_mfcc: number of MFCC coefficients; only used when ``feat='mfcc'``.
    n_fft: FFT size for 'mfcc'; used as ``frame_length`` for 'zcr' and 'rms'.
    hop_length: hop between successive frames, forwarded to librosa.

  Returns:
    ``np.ndarray`` built from the per-sample feature arrays, in input order.
    For 'zcr' and 'rms' the resulting shape is also printed.

  Raises:
    ValueError: if ``feat`` is not one of the supported names (previously
      the function silently returned ``None``).
  """
  # The signal is passed as the keyword ``y`` throughout: recent librosa
  # releases no longer accept it positionally for mfcc / rms.
  if feat == 'mfcc':
    def extract(sample):
      # Honour the caller's n_mfcc instead of a hard-coded 20.
      return librosa.feature.mfcc(y=sample, sr=SAMPLING_RATE, n_mfcc=n_mfcc,
                                  n_fft=n_fft, hop_length=hop_length)
  elif feat == 'zcr':
    def extract(sample):
      return librosa.feature.zero_crossing_rate(y=sample, frame_length=n_fft,
                                                hop_length=hop_length)
  elif feat == 'rms':
    def extract(sample):
      return librosa.feature.rms(y=sample, frame_length=n_fft,
                                 hop_length=hop_length)
  else:
    raise ValueError(
        "Unknown feat %r; expected one of 'mfcc', 'zcr', 'rms'" % (feat,))

  features = np.array([extract(sample) for sample in tqdm(X)])
  if feat != 'mfcc':
    # Only the 'zcr' and 'rms' branches reported their shape originally.
    print(features.shape)
  return features

In [None]:
def get_rec_model(shape=None):
  """Build, summarise and compile the recurrent (LSTM) classifier.

  The network is two stacked 128-unit LSTMs that return full sequences,
  50% dropout, four time-distributed ReLU dense layers (64, 32, 16 and
  8 units), a flatten, and a 3-way softmax output.

  Args:
    shape: per-sample input shape for the first LSTM; the data needs to be
      shaped (n, time, features), so this is (time, features). When omitted,
      the notebook-level ``input_shape`` variable is used, which keeps
      existing ``get_rec_model()`` calls working.

  Returns:
    The compiled Keras ``Sequential`` model (categorical cross-entropy loss,
    Adam optimizer, accuracy metric). ``model.summary()`` is printed as a
    side effect.
  """
  if shape is None:
    # Backward-compatible fallback to the global set by the calling cell.
    shape = input_shape

  model = Sequential()
  model.add(LSTM(128, return_sequences=True, input_shape=shape))
  model.add(LSTM(128, return_sequences=True))
  model.add(Dropout(0.5))
  for n_units in (64, 32, 16, 8):
    model.add(TimeDistributed(Dense(n_units, activation='relu')))
  model.add(Flatten())
  model.add(Dense(3, activation='softmax'))
  model.summary()
  model.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])
  return model

In [None]:
# Train the convolutional model. get_conv_model() reads the global
# `input_shape`: the two per-sample dimensions of X_train plus one channel.
input_shape = (X_train.shape[1], X_train.shape[2], 1)
model = get_conv_model()
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
    shuffle=True,
)

In [None]:
def transpose_data(X):
  """Return a new array in which every sample of ``X`` has been transposed.

  Each element obtained by iterating over ``X`` goes through
  ``np.transpose`` and the results are stacked with ``np.array`` in the
  original order.
  """
  return np.array([np.transpose(sample) for sample in X])

In [None]:
# Transpose every sample of both splits; the recurrent model expects the data
# as (n, time, features).
# NOTE(review): this overwrites X_train / X_test, so running the cell a second
# time transposes the samples back -- run it exactly once per kernel session.
X_train, X_test = (transpose_data(split) for split in (X_train, X_test))

In [None]:
# Train the recurrent model. get_rec_model() reads the global `input_shape`,
# here the two per-sample dimensions of the transposed X_train.
input_shape = (X_train.shape[1], X_train.shape[2])
model = get_rec_model()
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
    shuffle=True,
)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 4, 128)            76288     
                                                                 
 lstm_1 (LSTM)               (None, 4, 128)            131584    
                                                                 
 dropout (Dropout)           (None, 4, 128)            0         
                                                                 
 time_distributed (TimeDistr  (None, 4, 64)            8256      
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, 4, 32)            2080      
 tributed)                                                       
                                                                 
 time_distributed_2 (TimeDis  (None, 4, 16)           

<keras.callbacks.History at 0x7fc2e095dfd0>

In [None]:
# def consensusDecoder(y, spec):
#     n_fft = 1024
#     hop_length = 512
#     win_length = 1024
#     # X = np.abs(librosa.stft(x, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann', dtype = np.complex256))
#     # X = librosa.power_to_db(X**2,ref=np.max)

#     n_overlap = win_length//hop_length 
#     # every 3 frames of spec, give one label in y, 0.1-0.2, 0.15-0.25, 0.25-0.35, resolution is 0.1 seconds
#     # for each 0.1-second interval, take the consensus of the predictions that cover it

