In [3]:
import math
import os
import random

import numpy as np
import pandas as pd
import streamlit as st
import torch
import torchaudio
from IPython.display import Audio
from torchaudio import transforms

ImportError: cannot import name 'builder' from 'google.protobuf.internal' (c:\users\siddhant\pycharmprojects\siddhant_python\venv\lib\site-packages\google\protobuf\internal\__init__.py)

In [None]:
# Root folder that holds one sub-folder of recordings per instrument.
# Set the INSTRUMENTS_DIR environment variable to run the notebook on another
# machine; the default keeps the location this notebook was written against.
DATA_ROOT = os.environ.get("INSTRUMENTS_DIR", r"C:\Users\Siddhant\Downloads\Musical instruments")

sitar_files = os.path.join(DATA_ROOT, "Sitar", "wav")
violin_files = os.path.join(DATA_ROOT, "Violin")
# NOTE(review): ".wav" is used as a folder name here (the sitar folder is "wav") - confirm it is not a typo.
mohanveena_files = os.path.join(DATA_ROOT, "Mohan veena", ".wav")

# Pre-Processing

In [None]:
class AudioProcessing():
  """Static helpers that bring audio clips to a common format and turn them into mel spectrograms.

  Every helper takes and returns an ``(data, sampling_rate)`` tuple, where
  ``data`` is a 2-D tensor indexed as ``[channel, sample]``. Tensors are like
  multi-dimensional arrays with a uniform type.
  """

  @staticmethod
  def open(audio_file_path):
    """Load an audio file.

    Args:
      audio_file_path: path handed to ``torchaudio.load``.

    Returns:
      The ``(data, sampling_rate)`` tuple produced by ``torchaudio.load``.
    """
    data, sampling_rate = torchaudio.load(audio_file_path)
    return (data, sampling_rate)

  @staticmethod
  def rechannel(audio_file, new_channel):
    """Return the clip with exactly ``new_channel`` channels.

    Signals can be mono or stereo; this brings them all to the same number of
    channels. Mono is converted to stereo by duplicating the channel, stereo is
    converted to mono by keeping the first channel.
    Link for difference between mono/stereo:
    https://www.rowkin.com/blogs/rowkin/mono-vs-stereo-sound-whats-the-big-difference

    Args:
      audio_file: ``(data, sampling_rate)`` tuple.
      new_channel: number of channels wanted in the result.

    Returns:
      The input tuple itself when it already has ``new_channel`` channels,
      otherwise a new ``(data, sampling_rate)`` tuple.

    Raises:
      ValueError: if the clip has no channels, so there is nothing to duplicate.
    """
    data, sampling_rate = audio_file
    num_channels = data.shape[0]

    if num_channels == new_channel:
      return audio_file

    if num_channels == 0:
      raise ValueError("cannot rechannel audio that has no channels")

    if num_channels > new_channel:
      # Too many channels: keep the leading ones (stereo -> mono keeps channel 0).
      resig = data[:new_channel, :]
    else:
      # Too few channels: repeat the existing ones (mono -> stereo duplicates the
      # channel), then trim in case the repetition overshoots the target.
      repeats = -(-new_channel // num_channels)  # ceiling division
      resig = torch.cat([data] * repeats)[:new_channel, :]

    return (resig, sampling_rate)

  @staticmethod
  def resample(audio, new_sampling_rate):
    """Return the clip converted to ``new_sampling_rate``.

    Clips recorded at different sampling rates produce arrays of different
    sizes for the same duration (one second at 40000 Hz is 40000 samples, at
    40010 Hz it is 40010 samples), so the rate has to be standardised.

    Args:
      audio: ``(data, sampling_rate)`` tuple.
      new_sampling_rate: target sampling rate.

    Returns:
      The input tuple itself when the rate already matches, otherwise a new
      ``(data, new_sampling_rate)`` tuple.
    """
    data, sampling_rate = audio

    if sampling_rate == new_sampling_rate:
      return audio

    # Resample works along the last (time) dimension, so a single transform
    # handles every channel; there is no need to split and re-join them.
    resampler = transforms.Resample(sampling_rate, new_sampling_rate)
    resig = resampler(data)

    return (resig, new_sampling_rate)

  @staticmethod
  def pad_trunc(audio, max_ms):
    """Force the clip to a fixed length by truncating or zero padding.

    Longer clips are cut at the end. Shorter clips are padded with silence
    (zeros); the padding is split at a random point between the start and the
    end of the clip.

    Args:
      audio: ``(data, sampling_rate)`` tuple.
      max_ms: target length in milliseconds.

    Returns:
      ``(data, sampling_rate)`` with ``sampling_rate//1000 * max_ms`` samples
      per channel.
    """
    data, sampling_rate = audio
    num_rows, data_len = data.shape
    # The rate is integer-divided by 1000 before multiplying, so 44100 Hz counts
    # as 44 samples per millisecond. Kept as is because it fixes the length of
    # every clip this method produces.
    max_len = sampling_rate//1000 * max_ms

    if (data_len > max_len):
      # truncate to given length
      data = data[:,:max_len]

    elif (data_len < max_len):
      # Random split of the missing samples between the start and the end.
      pad_begin_len = random.randint(0, max_len - data_len)
      pad_end_len = max_len - data_len - pad_begin_len

      # new_zeros keeps the padding on the same dtype and device as the signal.
      pad_begin = data.new_zeros((num_rows, pad_begin_len))
      pad_end = data.new_zeros((num_rows, pad_end_len))

      data = torch.cat((pad_begin, data, pad_end), 1)

    return (data, sampling_rate)

  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None, top_db=80):
    """Convert the clip into a mel spectrogram expressed in decibels.

    Link for short explanation:
    https://colab.research.google.com/drive/1UgxygdrBfq7UGjhTCc9oupA-CyKFGhGa#scrollTo=733XclBe9Vgn

    Args:
      aud: ``(data, sampling_rate)`` tuple.
      n_mels: passed to ``MelSpectrogram`` as ``n_mels``.
      n_fft: passed to ``MelSpectrogram`` as ``n_fft``.
      hop_len: passed to ``MelSpectrogram`` as ``hop_length``.
      top_db: passed to ``AmplitudeToDB`` as ``top_db``.

    Returns:
      The spectrogram tensor after ``AmplitudeToDB``.
    """
    sig, sr = aud

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  

In [None]:
def data_processing(folder, new_channel=2, new_sampling_rate=44100, duration=6000):
  """Turn every audio file directly inside ``folder`` into a spectrogram.

  Each file is loaded, resampled, rechanneled, padded/truncated and converted
  with the ``AudioProcessing`` helpers, in that order.

  Args:
    folder: directory holding the recordings of one instrument, e.g. sitar.
    new_channel: channel count every clip is converted to (2 = stereo).
    new_sampling_rate: sampling rate every clip is converted to.
    duration: clip length in milliseconds (6000 ms = 6 s).

  Returns:
    List with one spectrogram per file, ordered by file name.
  """
  spectrograms = []

  # os.listdir returns entries in arbitrary order; sorting keeps the output
  # order the same from run to run.
  for file_name in sorted(os.listdir(folder)):
    file_path = os.path.join(folder, file_name)
    if not os.path.isfile(file_path):
      # Sub-directories cannot be loaded as audio, so they are skipped.
      continue

    audio = AudioProcessing.open(file_path)
    resampled_audio = AudioProcessing.resample(audio, new_sampling_rate)
    rechanneled_audio = AudioProcessing.rechannel(resampled_audio, new_channel)
    padded_audio = AudioProcessing.pad_trunc(rechanneled_audio, duration)
    spectro_gram = AudioProcessing.spectro_gram(padded_audio, n_mels=64, n_fft=1024, hop_len=None)
    spectrograms.append(spectro_gram)
  return spectrograms


# def data_processing_st(folder):
#     spectrograms = []
#     new_channel = 2  # making all stereo sounds
#     new_sampling_rate = 44100  # permanently setting a standard rate
#     duration = 6000  # setting a standard audio length of 6s, 6000ms
#
#     # looping over every files in the folder of musical instrument for ex: sitar
#     for i in os.listdir(folder):
#         audio = AudioProcessing.open(os.path.join(folder, i))
#         resampled_audio = AudioProcessing.resample(audio, new_sampling_rate)
#         rechanneled_audio = AudioProcessing.rechannel(resampled_audio, new_channel)
#         padded_audio = AudioProcessing.pad_trunc(rechanneled_audio, duration)
#         spectro_gram = AudioProcessing.spectro_gram(padded_audio, n_mels=64, n_fft=1024, hop_len=None)
#         spectrograms.append(spectro_gram)
#     return spectrograms


In [None]:
# Class id 0: violin. Each spectrogram tensor becomes a NumPy array and the
# arrays are stacked into one array with the sample index first.
violin_spectrograms = data_processing(violin_files)
violin_arr_list = [spec.numpy() for spec in violin_spectrograms]
violin_arr = np.array(violin_arr_list)
print(violin_arr.shape)


# Class id 1: mohan veena.
mohanveena_spectrograms = data_processing(mohanveena_files)
mohanveena_arr_list = [spec.numpy() for spec in mohanveena_spectrograms]
mohanveena_arr = np.array(mohanveena_arr_list)
print(mohanveena_arr.shape)


# Class id 2: sitar.
sitar_spectrograms = data_processing(sitar_files)
sitar_arr_list = [spec.numpy() for spec in sitar_spectrograms]
sitar_arr = np.array(sitar_arr_list)
print(sitar_arr.shape)

In [None]:
# Integer class labels, one per sample: 0 = violin, 1 = mohan veena, 2 = sitar.
# The counts come from the arrays themselves so the labels always line up with
# the number of files that were actually loaded.
y1 = np.zeros(len(violin_arr))
y2 = np.ones(len(mohanveena_arr))
y3 = np.full(len(sitar_arr), 2)
print(y1.shape)
print(y2.shape)
print(y3.shape)
y = np.concatenate((y1,y2,y3), axis=0)
print(y.shape)
# Column vector: one row per sample.
y = y.reshape(-1,1)
print(y.shape)

In [None]:
# Join the three instrument arrays along the sample axis: violin first, then
# mohan veena, then sitar.
instrument_arrays = [violin_arr, mohanveena_arr, sitar_arr]
x = np.concatenate(instrument_arrays)
print(x.shape)

# Artificial Neural Networks

In [None]:
from keras import models, layers

In [None]:
# Number of values in one flattened sample (every axis of x except the first),
# so the input layer follows the data instead of a hand-computed 2*64*516.
input_dim = int(np.prod(x.shape[1:]))

# Fully connected classifier: 512 -> 128 -> 3 class probabilities.
# NOTE(review): the "leaky_relu" activation string may not be accepted by older
# Keras releases - confirm against the installed version.
network_model = models.Sequential()
network_model.add(layers.Dense(512, activation="leaky_relu", input_shape=(input_dim,)))
# Only the first layer needs an input shape; later layers infer theirs.
network_model.add(layers.Dense(128, activation="relu"))
network_model.add(layers.Dense(3, activation="softmax"))


In [None]:
# Print the layer-by-layer overview of the model.
network_model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy as the reported metric.
network_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Flatten every sample into one row; the sizes are taken from x itself rather
# than being hard-coded.
x = x.reshape(len(x), -1)
# Scale the values by a constant factor of 1/255.
# NOTE(review): the inputs are decibel spectrograms rather than 8-bit images, so
# this is a plain rescale, not a standardisation - confirm it is what is wanted.
# Re-running this cell divides x again, so run it once per kernel session.
x = x.astype(float)/255
print(x.shape)
print(y.shape)

In [None]:
# Display the current contents of the label array.
y


In [None]:
##preprocessing the labels data
# to_categorical is imported from its public location, keras.utils.
from keras.utils import to_categorical

#one hot encoding
# num_classes is pinned to 3 to match the 3-unit softmax output layer.
# Run this cell once per kernel session: it overwrites y with its encoded form.
y = to_categorical(y, num_classes=3)

In [None]:
# Display the label array again, now that it has been one-hot encoded.
y

In [None]:
# Final shapes before training, one per line:
# x is (m, nx) and y is (m, number of classes).
print(x.shape, y.shape, sep="\n")

In [None]:
# Train on the whole data set for 15 epochs; no data is held out for validation here.
network_model.fit(x, y, epochs=15)

In [None]:
# Clips kept aside for checking the model; they go through the same
# preprocessing as the training folders and are stacked into one array.
check_dir = r"C:\Users\Siddhant\Downloads\Musical instruments\checking"
check_spectrograms = data_processing(check_dir)
check_arr_list = [spec.numpy() for spec in check_spectrograms]
check_arr = np.array(check_arr_list)
print(check_arr.shape)



