In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input mount recursively and print the full path of every
# file found, so the available audio clips are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)


/kaggle/input/6june-210/bheem3.wav
/kaggle/input/6june-210/thi1.wav
/kaggle/input/6june-210/tha2.wav
/kaggle/input/6june-210/thom1.wav
/kaggle/input/6june-210/thi3.wav
/kaggle/input/6june-210/bheem1-2.wav
/kaggle/input/6june-210/thom2.wav
/kaggle/input/6june-210/chappu3.wav
/kaggle/input/6june-210/chappu1.wav
/kaggle/input/6june-210/tha_thi_thom_num2.wav
/kaggle/input/6june-210/chappu4.wav
/kaggle/input/6june-210/thi2.wav
/kaggle/input/6june-210/ta2.wav
/kaggle/input/6june-210/ki3.wav
/kaggle/input/6june-210/ta1.wav
/kaggle/input/6june-210/dhin1.wav
/kaggle/input/6june-210/tha_thi_thom_num_4.wav
/kaggle/input/6june-210/ki2.wav
/kaggle/input/6june-210/bheem1.wav
/kaggle/input/6june-210/thom3.wav
/kaggle/input/6june-210/tha_thi_thom_num_3.wav
/kaggle/input/6june-210/tha_thi_thom_num_2_2.wav
/kaggle/input/6june-210/bheem2.wav
/kaggle/input/6june-210/dhin2.wav
/kaggle/input/6june-210/tha_thi_thom_num1.wav
/kaggle/input/6june-210/ki1.wav
/kaggle/input/6june-210/dheem2.wav
/kaggle/input/6jun

In [2]:
# Install the third-party packages that the import cell below relies on
# (mirdata for the dataset loader, noisereduce for noise removal).
# NOTE(review): versions are not pinned, so a later run may resolve different
# releases than the ones shown in the captured output; consider pinning and
# using `%pip` so the install targets the running kernel's environment.
!pip install mirdata
!pip install noisereduce

Collecting mirdata
  Downloading mirdata-0.3.7-py3-none-any.whl (14.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
Collecting jams (from mirdata)
  Downloading jams-0.3.4.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Collecting pretty-midi>=0.2.8 (from mirdata)
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting chardet (from mirdata)
  Downloading chardet-5.1.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting mido>=1.1.16 (from pretty-midi>=0.2.8-

In [3]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# plt.imshow(mpimg.imread("/kaggle/input/mridangamdataset/spectrogram.png"))

In [4]:
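# Module-level imports used by the MuSClPreprocessor and MuSCl classes below.
# (The same imports are repeated inside each class's __init__, but those are
# function-local and do not make the names available to the other methods.)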
from essentia.standard import Windowing, OnsetDetection,FFT,CartesianToPolar,FrameGenerator,Onsets,AudioOnsetsMarker,StereoMuxer, OnsetDetectionGlobal
from tempfile import TemporaryDirectory
import essentia
import mirdata
import pandas as pd
import librosa
import essentia.standard as estd
import numpy as np
import noisereduce as nr

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


In [5]:
class MuSClPreprocessor:
    '''
    Preprocessor class for MuSCl or any other music signal,
    check the documentation of each function for usage

    Size of the output signal depends on the duration of the input file and
    the sampling rate applied.
    The basic preprocessor function does not change the size of the input signal.
    '''
    # Analysis frame and hop sizes (in samples) used by onset_detection.
    _ONSET_FRAME_SIZE = 1024
    _ONSET_HOP_SIZE = 512

    def __init__(self):
        # These imports are local to __init__: they only check that the
        # dependencies can be imported. The methods below rely on the same
        # names being imported at module level.
        try:
            from essentia.standard import Windowing, OnsetDetection,FFT,CartesianToPolar,FrameGenerator,Onsets,AudioOnsetsMarker,StereoMuxer, OnsetDetectionGlobal
            from tempfile import TemporaryDirectory
            import essentia
            import mirdata
            import pandas as pd
            import librosa
            import essentia.standard as estd
            import numpy as np
            import noisereduce as nr
        except Exception as e:
            print("Terminated, As the following error was raised",e)

    def BasicPreprocessor(self,filename=None,audio=None,inputsamplerate=44100,noise_remove=False,sr=44100,normalize_type='librosa',target_peak=0.9,target_rms=0.5):
        '''
        Loads (and thereby resamples) a file or takes an audio vector, then
        applies normalization and optional noise removal.

        Parameters:
        filename: path of the audio file to load; used only when `audio` is None
        audio: audio vector; when given, it is used as is and `filename` is ignored
        inputsamplerate: sample rate of `audio`.
            NOTE(review): this value is not used to resample an `audio` vector
            passed in directly; only files loaded from `filename` are resampled.
        noise_remove: if True, noisereduce.reduce_noise is applied after normalization
        sr: Resampling rate used when loading `filename` (default 44100); also
            passed to the noise reducer
        normalize_type: Type of normalization options: librosa | peak-normalize | rms
            for peak-normalize option, target_peak is preferred to be 0.9
            for rms-normalization option, target_rms is preferred to be 0.5
            any other value leaves the signal un-normalized
        target_peak: value for peak-normalize option, if normalize_type == 'peak-normalize'
        target_rms: value for rms-normalization option if normalize_type == 'rms'

        Returns audio_signal, or None (after printing an error) when neither
        `filename` nor `audio` is given.
        '''
        # `is None` is required here: `audio == None` on a numpy array is an
        # element-wise comparison whose truth value is ambiguous and raises.
        if audio is None and filename is None:
            print("Error, Please give any one of the parameters as input - audio or filename")
            return
        if audio is None:
            # librosa.load resamples to `sr` and returns a mono amplitude vector.
            # NOTE(review): confirm 'HQ' is a res_type accepted by the installed
            # librosa version; it only matters when the file's native rate != sr.
            audio,inputsamplerate=librosa.load(filename,sr=sr,res_type='HQ')
        else:
            # Accept plain sequences as well as arrays.
            audio=np.asarray(audio)

        # Normalization part
        if normalize_type=='librosa':
            audio=librosa.util.normalize(audio)
        elif normalize_type=='peak-normalize':
            max_amplitude = np.max(np.abs(audio)) if audio.size else 0
            # A silent (or empty) signal cannot be scaled; leave it untouched
            # instead of dividing by zero and producing NaN/inf samples.
            if max_amplitude > 0:
                audio = audio * (target_peak / max_amplitude)
        elif normalize_type=='rms':
            rms = np.sqrt(np.mean(audio**2)) if audio.size else 0
            if rms > 0:
                audio = audio * (target_rms / rms)

        # noise-removal part
        if noise_remove:
            audio=nr.reduce_noise(audio,sr=sr)

        return audio

    def apply_window(self,audio_signal, window_type='hamming'):
        '''
        Multiplies the whole signal by a window of the same length.

        Parameters:
        audio_signal: vector for audio signal
            librosa.load output or estd.MonoLoader().compute() output can be an input
        window_type: hann | hamming | rectangular | blackman
            any other value falls back to the rectangular window (all ones)

        Returns the windowed signal.
        '''
        length = len(audio_signal)
        # Build only the requested window instead of all four.
        builders = {
            'rectangular': lambda: np.ones_like(audio_signal),
            'hann': lambda: np.hanning(length),
            'hamming': lambda: np.hamming(length),
            'blackman': lambda: np.blackman(length)
        }
        window = builders.get(window_type, builders['rectangular'])()
        return audio_signal * window

    def frame_signal(self,signal, frame_size=1024, hop_size=512):
        '''
        Function to generate frames
        Parameters:
        signal: vector[real]
        frame_size: Size of each frame in samples
        hop_size: Hop size between consecutive frames in samples

        Returns an array of shape (num_frames, frame_size). Trailing samples
        that do not fill a complete frame are dropped; a signal shorter than
        frame_size yields an empty (0, frame_size) array.
        '''
        available = len(signal) - frame_size
        if available < 0:
            # Not even one full frame fits. Truncating the negative quotient
            # toward zero used to produce a bogus frame count here.
            return np.zeros((0, frame_size))
        num_frames = 1 + available // hop_size
        frames = np.zeros((num_frames, frame_size))
        for i in range(num_frames):
            start = i * hop_size
            frames[i] = signal[start:start + frame_size]
        return frames

    def compute_cqt(self,audio_signal,sr,n_bins=84,hop_length=512,n_frames=100):
        '''
        Computes the constant-Q transform with 12 bins per octave.

        Parameters:
        audio_signal: audio vector (e.g. librosa.load output)
        sr: sample_rate of the audio signal
        n_bins: required number of bins
        hop_length: hop length in the signal
        n_frames: accepted for compatibility; not used by the computation

        returns:
        CQ-Transform of the input, np.ndarray
        '''

        cqt = librosa.cqt(audio_signal, sr=sr, n_bins=n_bins, hop_length=hop_length, bins_per_octave=12)
        return cqt

    @staticmethod
    def _group_consecutive(indices):
        '''Collapse a sorted 1-D index array into [start, end] runs of consecutive values.'''
        indices = np.asarray(indices)
        if indices.size == 0:
            return []
        # Positions after which the next index is not the immediate successor.
        breaks = np.flatnonzero(np.diff(indices) != 1)
        starts = np.concatenate(([indices[0]], indices[breaks + 1]))
        ends = np.concatenate((indices[breaks], [indices[-1]]))
        return [[start, end] for start, end in zip(starts, ends)]

    def onset_detection(self,filename=None,audio=None,sr=44100,complex=True,hfc=False):
        '''
        This function detects onsets using either the 'complex' or the 'hfc' Onset Detection function
        from essentia.standard. All the required imports should be done at module level to use it.

        Parameters:
        filename: default=None. the location of the audio file for onset detection. Flexible, can provide audio vector instead
        audio: default=None. audio signal of the music file. Flexible, can give filename instead. But one of them has to be given.
        sr: default=44100. sample rate of the input; used when loading `filename` and passed to the Essentia algorithms
        complex: default=True. If True, the 'complex' method of essentia.standard.OnsetDetection is used;
            if False, the 'hfc' method is used
        hfc: default=False. Kept for compatibility; the method is selected by `complex` alone

        Returns:
        A list of lists and MUX audio signal, each inner list holding the first and last sample index of a
        run of non-zero marker samples in the audio where an onset was detected.
        If a single onset is detected the output is a list of a single list and the audio MUX file.
        If no onset is detected the list is empty.
        The MUX file is a (n,2) array where the first column indicates the actual signal vector values and the
        second column indicates where the onsets were detected.
        When neither `filename` nor `audio` is given, an error is printed and [[]] alone is returned.
        '''

        # `is None` avoids the ambiguous element-wise comparison on arrays.
        if audio is None and filename is None:
            print("Error, provide atleast one of the parameters - filename, audio")
            return [[]]

        if audio is None:
            audio=librosa.load(filename,sr=sr)[0]
        # Essentia algorithms operate on single-precision vectors.
        audio=essentia.array(audio)

        frame_size = self._ONSET_FRAME_SIZE
        hop_size = self._ONSET_HOP_SIZE
        sample_rate = float(sr)

        # Auxiliary algorithms to compute magnitude and phase of every frame.
        w = Windowing(type='hann')
        fft = FFT() # Outputs a complex FFT vector.
        c2p = CartesianToPolar() # Converts it into a pair of magnitude and phase vectors.
        pool = essentia.Pool()
        # Frame rate of the detection function: one value per hop.
        onsets=Onsets(frameRate=sample_rate / hop_size)

        # The detector must exist before the frame loop that calls it.
        method = 'complex' if complex else 'hfc'
        pool_key = 'odf.' + method
        detector = OnsetDetection(method=method, sampleRate=sample_rate)
        for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size):
            magnitude, phase = c2p(fft(w(frame)))
            pool.add(pool_key, detector(magnitude, phase))

        # Onsets expects a matrix (one row per detection function) and a
        # weight per row; with a single function the weight value is irrelevant.
        onset_times = onsets(essentia.array([pool[pool_key]]), [1])

        # Beeps are written onto a silent channel so the markers stay separate
        # from the original audio, then both are muxed into a stereo signal.
        silence = np.zeros(len(audio), dtype=np.float32)
        beeps = AudioOnsetsMarker(onsets=onset_times, type='beep', sampleRate=sample_rate)(silence)
        audio_ = StereoMuxer()(audio, beeps)

        marked = np.where(audio_[:,1]!=0)[0]
        return self._group_consecutive(marked),audio_

In [6]:
# Music Stroke Classification class written by using compiam, other research papers and dataset from mirdata
class MuSCl:
    '''
    Music Stroke Classification helper built around a mirdata dataset.

    Holds the initialized mirdata dataset and, for the "mridangam_stroke"
    dataset, the track ids, loaded tracks, the available stroke names and a
    mapping from each stroke name to the audio paths of its tracks.
    '''

    def __init__(self):
        # Function-local imports: they only check that the dependencies can
        # be imported; a failure is printed rather than raised.
        try:
            from essentia.standard import (
                Windowing, OnsetDetection, FFT, CartesianToPolar, FrameGenerator,
                Onsets, AudioOnsetsMarker, StereoMuxer, OnsetDetectionGlobal,
            )
            from tempfile import TemporaryDirectory
            import essentia
            import mirdata
            import pandas as pd
            import librosa
            import essentia.standard as estd
            import numpy as np
            import noisereduce as nr
        except Exception as e:
            print("Terminated, As the following error was raised",e)
        self.dataset=None
        self.dataset_loaded=False

    def load_dataset(self,dataset_name="mridangam_stroke",version="default",data_home=None,download=True):
        '''Initialize a mirdata dataset and, for mridangam_stroke, index its tracks.

        Parameters:
        dataset_name : Dataset name according to MIR datasets
        version : Version type of the dataset required
        data_home : location of an already downloaded dataset, or the
            directory the dataset should be downloaded to
        download : True, the dataset is downloaded and validated through
            mirdata. False, the dataset is expected to be present already

        Raises ValueError when download is False and the
        "mridangam_stroke_1.5" folder is not found under the data home.

        returns None, the loaded data is stored on the instance
        '''
        self.dataset = mirdata.initialize(
            dataset_name=dataset_name,
            data_home=data_home,
            version=version,
        )
        # Where the data is stored after downloading.
        self.data_home = self.dataset.data_home

        if not download:
            expected_dir = os.path.join(self.data_home, "mridangam_stroke_1.5")
            if not os.path.exists(expected_dir):
                raise ValueError(
                    "Dataset not found, please re-run load_dataset with download=True"
                )
        else:
            self.dataset.download()
            self.dataset.validate()

        # Stroke bookkeeping is only built for the mridangam stroke dataset.
        if dataset_name != "mridangam_stroke":
            return

        self.dataset_loaded = True
        self.mridangam_ids = self.dataset.track_ids
        self.mridangam_tracks = self.dataset.load_tracks()
        self.stroke_names = self.get_strokes()

        # Group the audio paths of all tracks by their stroke name.
        paths_by_stroke = {stroke: [] for stroke in self.stroke_names}
        self.stroke_dict = paths_by_stroke
        for track_id in self.mridangam_ids:
            track = self.mridangam_tracks[track_id]
            paths_by_stroke[track.stroke_name].append(track.audio_path)

    def _unique_stroke_names(self):
        """Return the sorted unique stroke names of all loaded tracks as an array."""
        if not self.dataset_loaded:
            raise ValueError('Dataset Not Loaded') # Change according to convenience
        per_track = [
            self.mridangam_tracks[track_id].stroke_name
            for track_id in self.mridangam_ids
        ]
        return np.unique(per_track)

    def get_strokes(self):
        """List available mridangam strokes in the dataset.

        :returns: list of strokes in the datasets.
        :raises ValueError: if the dataset has not been loaded.
        """
        return list(self._unique_stroke_names())

    def dict_strokes(self):
        """List and convert to indexed dict the available mridangam strokes in the dataset.

        :returns: dict with strokes as values and unique integer as keys.
        :raises ValueError: if the dataset has not been loaded.
        """
        return dict(enumerate(self._unique_stroke_names()))
    
    

In [7]:
# Create the classifier wrapper and load the Mridangam Stroke dataset through mirdata.
classifier=MuSCl()
classifier.load_dataset("mridangam_stroke",download=True) # download=True downloads and validates the data; pass download=False once it is already present under the data home

124MB [00:06, 19.7MB/s]                           
100%|██████████| 6976/6976 [00:00<00:00, 7614.80it/s]


In [8]:
import os
#os.listdir("/root/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/E/")
#os.listdir("/kaggle/input/6june-210/")

In [9]:
import essentia.standard as estd

# Preprocess one stroke recording, run Essentia's Extractor on it and list
# every descriptor that holds more than one value together with its size.
filename=r"/root/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/E/230381__akshaylaya__dhin-e-112.wav"
preprocessor=MuSClPreprocessor()
# target_peak is only consulted for normalize_type='peak-normalize'; it has no
# effect with the 'librosa' normalization used here.
val=preprocessor.BasicPreprocessor(filename,noise_remove=True,normalize_type='librosa',target_peak=0.8)
feats=estd.Extractor()(preprocessor.apply_window(val,'hamming'))
for name in feats.descriptorNames():
    values = feats[name]
    try:
        n_values = len(values)
    except TypeError:
        # Scalar descriptors have no length; they are not listed.
        continue
    if n_values <= 1:
        continue
    try:
        # Two-level descriptor: report (rows, length of the first row).
        print(name,(n_values,len(values[0])))
    except TypeError:
        # One-level descriptor: its elements are scalars without a length.
        print(name,n_values,(n_values,))

lowLevel.barkbands_kurtosis 23 (23,)
lowLevel.barkbands_skewness 23 (23,)
lowLevel.barkbands_spread 23 (23,)
lowLevel.dissonance 23 (23,)
lowLevel.hfc 23 (23,)
lowLevel.pitch 23 (23,)
lowLevel.pitch_instantaneous_confidence 23 (23,)
lowLevel.pitch_salience 23 (23,)
lowLevel.silence_rate_20dB 23 (23,)
lowLevel.silence_rate_30dB 23 (23,)
lowLevel.silence_rate_60dB 23 (23,)
lowLevel.spectral_centroid 23 (23,)
lowLevel.spectral_complexity 23 (23,)
lowLevel.spectral_crest 23 (23,)
lowLevel.spectral_decrease 23 (23,)
lowLevel.spectral_energy 23 (23,)
lowLevel.spectral_energyband_high 23 (23,)
lowLevel.spectral_energyband_low 23 (23,)
lowLevel.spectral_energyband_middle_high 23 (23,)
lowLevel.spectral_energyband_middle_low 23 (23,)
lowLevel.spectral_flatness_db 23 (23,)
lowLevel.spectral_flux 23 (23,)
lowLevel.spectral_kurtosis 23 (23,)
lowLevel.spectral_rms 23 (23,)
lowLevel.spectral_rolloff 23 (23,)
lowLevel.spectral_skewness 23 (23,)
lowLevel.spectral_spread 23 (23,)
lowLevel.spectral_stro



In [10]:
# Path of a single "bheem" stroke recording from the downloaded dataset.
filename=r"/root/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/E/230150__akshaylaya__bheem-e-015.wav"
# Disabled experiment: preprocess the file above, then preprocess the first
# five dataset tracks and print any whose output shape differs from it.
# NOTE(review): `dataset` is not defined in the cells shown here; presumably
# `classifier.dataset` was intended - confirm before re-enabling.
# preprocessor=MuSClPreprocessor()
# val=preprocessor.BasicPreprocessor(filename,noise_remove=True,normalize_type='peak-normalize',target_peak=0.8).shape
# dc=dataset.load_tracks()
# for track in list(dc.keys())[:5]:
#     audio_signal=preprocessor.BasicPreprocessor(dc[track].audio_path,noise_remove=True,normalize_type='peak-normalize',target_peak=0.8)
#     if audio_signal.shape!=val:
#         print(audio_signal.shape,dc[track].audio_path)