In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import os
import numpy as np 
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

In [7]:
sitar_files = "/content/drive/MyDrive/Musical instruments/Sitar/wav"
violin_files = "/content/drive/MyDrive/Musical instruments/Violin"
mohanveena_files = "/content/drive/MyDrive/Musical instruments/Mohan veena/.wav"

# Pre-Processing

In [8]:
class AudioProcessing():

  """
  open method is used to load the audio file and returns your signal as a Tensor and sampling rate
  Tensors are like multi-dimensional arrays with a uniform type
  """

  @staticmethod
  def open(audio_file_path):
    data, sampling_rate = torchaudio.load(audio_file_path)
    return (data, sampling_rate)
         

  """
  rechannel method: signals can either be mono or stereo. This method is used to get all our signals in the same dimensions.
  It converts all mono signals to stereo by duplicating the first channel
  Link for difference between mono/stereo : https://www.rowkin.com/blogs/rowkin/mono-vs-stereo-sound-whats-the-big-difference 
  """
#channels stereo/mono
  @staticmethod
  def rechannel(audio_file, new_channel):
    data, sampling_rate = audio_file

    if (data.shape[0] == new_channel):
      return audio_file

    if (new_channel == 1):
      # stereo to mono
      resig = data[:1, :]
    else:
      # mono to stereo by duplicating
      resig = torch.cat([data, data])

    return ((resig, sampling_rate))

  """
  resampling method: our audio signals have different sampling rates as well. Hence, We need to standardise the sampling rate.
  Different sampling rates result in different array sizes. Ex: sr - 40000Hz means array size of 400000 whereas 40010Hz means aaray size of 40010
  After standardisation we get all arrays of the same size
  """
#resample one at a time and merge
  @staticmethod
  def resample(audio, new_sampling_rate):
    data, sampling_rate = audio

    if (sampling_rate == new_sampling_rate):
      return audio

    num_channels = data.shape[0]
    resig = torchaudio.transforms.Resample(sampling_rate, new_sampling_rate)(data[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sampling_rate, new_sampling_rate)(data[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, new_sampling_rate))

  """
  pad_trunc method: Our audio files are bound to be of different lengths of time. This also needs to be standardised.
  This method either extends the length by padding with silence (Zero Padding) or reduces the length by truncating
  """
  @staticmethod
  def pad_trunc(audio, max_ms):
    data, sampling_rate = audio
    num_rows, data_len = data.shape
    max_len = sampling_rate//1000 * max_ms

    if (data_len > max_len): 
      # truncate to given length
      data = data[:,:max_len]

    elif (data_len < max_len):
      # padding at the start and end of the audio
      pad_begin_len = random.randint(0, max_len - data_len) #fill with random no between at 0 upto the extra time(maxlen-datalen)
      pad_end_len = max_len - data_len - pad_begin_len

      # Pad with 0s - Zero Padding
      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      data = torch.cat((pad_begin, data, pad_end), 1)
      
    return (data, sampling_rate)

 
  # Spectrogram finally!!!
  """
  spectrogram method: 
  Link for short explanation: https://colab.research.google.com/drive/1UgxygdrBfq7UGjhTCc9oupA-CyKFGhGa#scrollTo=733XclBe9Vgn
  """
  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  

In [9]:
def data_processing(folder):
    
  spectrograms = []
  new_channel = 2                   #making all stereo sounds
  new_sampling_rate = 44100         #permanently setting a standard rate
  duration = 6000                   #setting a standard audio length of 6s, 6000ms

  # looping over every files in the folder of musical instrument for ex: sitar
  for i in os.listdir(folder):
  
    audio = AudioProcessing.open(os.path.join(folder,i))
    resampled_audio = AudioProcessing.resample(audio, new_sampling_rate)
    rechanneled_audio = AudioProcessing.rechannel(resampled_audio, new_channel)
    padded_audio = AudioProcessing.pad_trunc(rechanneled_audio, duration)
    spectro_gram = AudioProcessing.spectro_gram(padded_audio, n_mels=64, n_fft=1024, hop_len=None)
    spectrograms.append(spectro_gram) 
  return spectrograms




In [10]:
violin_spectrograms  = data_processing(violin_files) #cls id = 0 
violin_arr_list = []
for i in violin_spectrograms:
  arr = i.numpy()
  violin_arr_list.append(arr)
violin_arr = np.array(violin_arr_list)
print(violin_arr.shape)


mohanveena_spectrograms  = data_processing(mohanveena_files) #cls id = 1 
mohanveena_arr_list = []
for i in mohanveena_spectrograms:
  arr = i.numpy()
  mohanveena_arr_list.append(arr)
mohanveena_arr = np.array(mohanveena_arr_list)
print(mohanveena_arr.shape)


sitar_spectrograms  = data_processing(sitar_files) #cls id = 2
sitar_arr_list = []
for i in sitar_spectrograms:
  arr = i.numpy()
  sitar_arr_list.append(arr)
sitar_arr = np.array(sitar_arr_list)
print(sitar_arr.shape)

(6, 2, 64, 516)
(10, 2, 64, 516)
(10, 2, 64, 516)


In [11]:
y1 = np.zeros(6) 
y2 = np.ones(10)
y3 = np.full(10,2)
print(y1.shape)
print(y2.shape)
print(y3.shape)
y = np.concatenate((y1,y2,y3), axis=0)
print(y.shape)
y = y.reshape(26,1)
print(y.shape)

(6,)
(10,)
(10,)
(26,)
(26, 1)


In [12]:
x = np.concatenate((violin_arr,mohanveena_arr,sitar_arr),axis=0)
print(x.shape)

(26, 2, 64, 516)


# Artificial Neural Networks

In [1]:
from keras import models, layers

In [2]:
network_model = models.Sequential()
network_model.add(layers.Dense(512, activation="leaky_relu", input_shape=(2*64*516,)))
network_model.add(layers.Dense(128, activation="relu", input_shape=(2*64*516,)))
network_model.add(layers.Dense(3, activation="softmax"))


In [3]:
network_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               33817088  
                                                                 
 dense_1 (Dense)             (None, 128)               65664     
                                                                 
 dense_2 (Dense)             (None, 3)                 387       
                                                                 
Total params: 33,883,139
Trainable params: 33,883,139
Non-trainable params: 0
_________________________________________________________________


In [4]:
network_model.compile(optimizer="adam", metrics=["accuracy"], loss="categorical_crossentropy")

In [13]:
x = x.reshape(26, 2*64*516)
x = x.astype(float)/255 #standardisation - line starts from origin 
print(x.shape)
print(y.shape)

(26, 66048)
(26, 1)


In [14]:
y


array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.]])

In [15]:
##preprocessing the labels data
from tensorflow.keras.utils import to_categorical

#one hot encoding
y = to_categorical(y)


In [16]:
y

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [17]:
print(x.shape) #m, nx
print(y.shape) #m, no of classes

(26, 66048)
(26, 3)


In [18]:
network_model.fit(x, y, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fbaa1f693d0>

# LDA


In [19]:
import sklearn
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda

In [20]:
print(x.shape)   #ML - m, nx format 

(26, 66048)


In [21]:
y_1 = np.zeros(6)
y_2 = np.ones(10)
y_3 = np.full(10,2)
print(y_1.shape)
print(y_2.shape)
print(y_3.shape)
y_ = np.concatenate((y_1,y_2,y_3), axis=0)
print(y_.shape)
y_ = y_.reshape(26,1)
print(y_.shape)

(6,)
(10,)
(10,)
(26,)
(26, 1)


In [22]:
x_df = pd.DataFrame(x)
# x_df.head()

y_df = pd.DataFrame(y_)
# y_df.head()

In [23]:
x_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66038,66039,66040,66041,66042,66043,66044,66045,66046,66047
0,0.025032,-0.090484,-0.087138,-0.092207,-0.076259,-0.079505,-0.069672,-0.117736,-0.098156,-0.097636,...,-0.155297,-0.155297,-0.155297,-0.155297,-0.155297,-0.155297,-0.155297,-0.155297,-0.155297,-0.147608
1,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,...,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303,-0.19303
2,0.006038,-0.081756,-0.091666,-0.074198,-0.064413,-0.111878,-0.10696,-0.085069,-0.075285,-0.041502,...,-0.164481,-0.164481,-0.164481,-0.164481,-0.164481,-0.164481,-0.164481,-0.164481,-0.164481,-0.160583
3,0.058776,-0.037989,-0.061371,-0.104502,-0.101425,-0.044822,-0.016779,-0.032939,-0.059585,-0.055484,...,-0.150176,-0.150176,-0.150176,-0.150176,-0.150176,-0.150176,-0.150176,-0.150176,-0.150176,-0.150176
4,0.073489,-0.068629,-0.057619,-0.117497,-0.046673,-0.052807,-0.0563,-0.100291,-0.044439,-0.036422,...,-0.162598,-0.162598,-0.162598,-0.162598,-0.162598,-0.162598,-0.162598,-0.162598,-0.162598,-0.162598


In [24]:
y_df.head()

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [25]:
LDA = lda(n_components =2)   # n-1 == 3-1 == 2

In [26]:
LDA.fit(x_df,y_df)

  y = column_or_1d(y, warn=True)


LinearDiscriminantAnalysis(n_components=2)

In [27]:
z = LDA.transform(x_df)  #Z-score
z

array([[-3.33816195, -2.41249905],
       [-0.62328577,  0.33626046],
       [-2.71229053, -1.24789056],
       [-2.24239775, -0.83737061],
       [-3.11512712, -1.81193106],
       [-2.56189371, -2.42162095],
       [ 3.39676031, -2.13255301],
       [ 0.91093259,  0.01754986],
       [ 2.67469601, -0.66339281],
       [ 2.19006258, -0.05702   ],
       [ 1.78032228, -0.80694315],
       [ 3.74101947, -0.3578741 ],
       [ 1.41378003,  0.91758456],
       [ 2.8523143 , -0.36189119],
       [ 3.38832241, -0.80677825],
       [ 0.82825465,  0.091133  ],
       [ 0.27536734,  2.05214217],
       [-2.80806724,  1.00202661],
       [-0.92728977,  1.2421681 ],
       [-0.51307465,  2.30581706],
       [-0.95566784,  0.11994427],
       [ 0.73158772,  0.92390556],
       [-1.2699029 ,  3.33458911],
       [-0.88997549,  0.01887805],
       [-0.80344123, -0.27694821],
       [-1.42284374,  1.83271413]])

In [28]:
y_pred = LDA.predict(x_df)
y_pred

array([0., 2., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [29]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [30]:
cm = confusion_matrix(y_df,y_pred)
print(cm)

[[ 5  0  1]
 [ 0 10  0]
 [ 0  0 10]]


LDA before PCA

In [31]:
accuracy_score(y_df ,y_pred)    # 25/26 = 0.96

0.9615384615384616

# PCA


In [32]:
from sklearn.decomposition import PCA

In [33]:
pca = PCA(n_components=1)
pca.fit(x_df)

print(f"pca.components_:\n{pca.components_}")  #eigen vector
print(f"\n\npca.explained_variance_:\n{pca.explained_variance_}")    #eigen value
print(f"\n\npca.explained_variance_ratio_:\n{pca.explained_variance_ratio_}") #percentage of variabilty 
z_scores = pca.transform(x_df)
print(f"\n\nz_scores:\n{z_scores}")   #Z-Score

pca.components_:
[[-0.0082795  -0.00485313 -0.00452496 ... -0.001362   -0.001362
  -0.00182884]]


pca.explained_variance_:
[62.78468513]


pca.explained_variance_ratio_:
[0.33922695]


z_scores:
[[-11.24696588]
 [  4.36378769]
 [-10.05817279]
 [ -9.42454166]
 [-12.8574765 ]
 [-12.74985098]
 [ 11.44773055]
 [  7.55199398]
 [ -0.05496496]
 [  2.24821666]
 [ -3.0938237 ]
 [  9.4383823 ]
 [  0.03503697]
 [ 13.84573003]
 [ 15.42497386]
 [  9.16685155]
 [ -5.61914167]
 [ -4.41007906]
 [ -1.21783718]
 [  0.87017519]
 [  3.88262139]
 [ -0.33693644]
 [ -2.83659013]
 [ -0.42088741]
 [ -4.82773598]
 [  0.87950417]]


In [34]:
new_x = pd.DataFrame(z_scores)
new_x

Unnamed: 0,0
0,-11.246966
1,4.363788
2,-10.058173
3,-9.424542
4,-12.857476
5,-12.749851
6,11.447731
7,7.551994
8,-0.054965
9,2.248217


LDA after PCA 

In [35]:
new_x.shape

(26, 1)

In [36]:
LDA = lda(n_components =1)
LDA.fit(new_x,y_)
z = LDA.transform(new_x)
print(z)
y_pred = LDA.predict(new_x)
print(y_pred)

[[-2.09325752]
 [ 0.81217739]
 [-1.87200228]
 [-1.75407242]
 [-2.39300178]
 [-2.37297078]
 [ 2.13062334]
 [ 1.40555847]
 [-0.01022994]
 [ 0.41843253]
 [-0.57581483]
 [ 1.75664841]
 [ 0.00652099]
 [ 2.57693309]
 [ 2.87085805]
 [ 1.70611178]
 [-1.04582077]
 [-0.82079303]
 [-0.22666085]
 [ 0.16195486]
 [ 0.72262391]
 [-0.06270978]
 [-0.52793915]
 [-0.07833453]
 [-0.8985263 ]
 [ 0.16369114]]
[0. 1. 0. 0. 0. 0. 1. 1. 2. 2. 2. 1. 2. 1. 1. 1. 2. 2. 2. 2. 1. 2. 2. 2.
 2. 2.]


  y = column_or_1d(y, warn=True)


In [37]:
cm = confusion_matrix(y_,y_pred)
cm

array([[5, 1, 0],
       [0, 6, 4],
       [0, 1, 9]])

In [38]:
accuracy_score(y_ ,y_pred)

0.7692307692307693