# Preprocessing of the original data

In [52]:
import os
import re
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import soundfile as sf  
import audioread
from pydub import AudioSegment

In [None]:
def read_files(filedir, format='wav'):
    """Return the sorted names of the entries in ``filedir`` with a given extension.

    Args:
        filedir: Directory to scan (not recursive).
        format: Extension to keep, with or without the leading dot
            (``'wav'`` and ``'.wav'`` are equivalent). The comparison is
            case-sensitive. An empty string keeps every entry.

    Returns:
        Alphabetically sorted list of bare names (no directory prefix).
    """
    # Anchor the match on the dot: a plain ``endswith('wav')`` would also accept
    # names such as ``notwav`` that merely end in the same letters.
    suffix = '.' + format.lstrip('.') if format else ''
    files = sorted(
        filename for filename in os.listdir(filedir)
        if filename.endswith(suffix)
    )
    print(f'no of {format} files: {len(files)}')
    return files

## Preprocessing: group the recordings by raaga



In [43]:
def excract_the_file_name(files):
    """Group file names by raaga, keyed by their lower-cased leading letters.

    The raaga name is the run of ASCII letters at the start of a file name,
    lower-cased, so ``'Bhairavi01.wav'`` and ``'bhairavi27.wav'`` both land
    under ``'bhairavi'``.

    Args:
        files: Iterable of file names.

    Returns:
        Dict mapping each raaga name to the list of its file names, in the
        order they were encountered.

    Raises:
        IndexError: If a file name does not start with an ASCII letter.
    """
    leading_letters = re.compile(r"^[a-zA-Z]+")
    grouped = {}
    for name in files:
        raaga = leading_letters.findall(name)[0].lower()
        # setdefault creates the bucket on first sight of a raaga.
        grouped.setdefault(raaga, []).append(name)
    return grouped

In [46]:
mp3_filedir =  "data/raaga/"
wav_filedir = "data/wav_raaga/"
files_mp3 = read_files(mp3_filedir, format='mp3')

# Group the source recordings by raaga (leading letters of the file name).
raagas = excract_the_file_name(files_mp3)

os.makedirs(wav_filedir, exist_ok=True)
for raaga, files in raagas.items():
    for file in files:
        # pydub performs the MP3 -> WAV conversion by itself, so each file is
        # decoded exactly once here.
        sound = AudioSegment.from_mp3(os.path.join(mp3_filedir, file))
        # Swap only the extension; str.replace would rewrite every ".mp3"
        # occurring anywhere in the name.
        wav_name = os.path.splitext(file)[0] + ".wav"
        sound.export(os.path.join(wav_filedir, wav_name), format="wav")

print("done")

# Re-scan the output directory so that `raagas` now lists the WAV files.
raagas = excract_the_file_name(read_files(wav_filedir, format='wav'))
print(raagas)

no of files: 82
done
no of files: 82
{'bhairavi': ['Bhairavi01.wav', 'Bhairavi02.wav', 'Bhairavi03.wav', 'Bhairavi04.wav', 'Bhairavi05.wav', 'bhairavi27.wav', 'bhairavi28.wav', 'bhairavi29.wav', 'bhairavi30.wav', 'bhairavi31.wav'], 'dkanada': ['DKanada01.wav', 'DKanada02.wav', 'DKanada03.wav', 'DKanada04.wav', 'DKanada05.wav'], 'asavari': ['asavari01.wav', 'asavari02.wav', 'asavari03.wav', 'asavari04.wav', 'asavari05.wav', 'asavari25.wav', 'asavari26.wav', 'asavari27.wav', 'asavari28.wav', 'asavari29.wav'], 'bageshree': ['bageshree01.wav', 'bageshree02.wav', 'bageshree03.wav', 'bageshree04.wav', 'bageshree05.wav', 'bageshree24.wav', 'bageshree25.wav', 'bageshree27.wav', 'bageshree28.wav', 'bageshree29.wav', 'bageshree30.wav', 'bageshree31.wav'], 'bhoop': ['bhoop01.wav', 'bhoop02.wav', 'bhoop03.wav', 'bhoop04.wav', 'bhoop05.wav'], 'bhoopali': ['bhoopali21.wav', 'bhoopali22.wav', 'bhoopali23.wav', 'bhoopali24.wav', 'bhoopali25.wav'], 'darbari': ['darbari26.wav', 'darbari27.wav', 'darbari

In [48]:
def splitter(audio_file, sr, category, index, duration=20, output_root="raaga"):
    """Cut a signal into fixed-length clips and write each one as a WAV file.

    Clips are written to ``{output_root}/{category}/{category}_{index}.wav``;
    the target directory must already exist. A trailing remainder shorter than
    ``duration`` seconds is discarded so that every clip has the same length.

    Args:
        audio_file: Sliceable sequence of samples supporting ``len()``.
        sr: Sample rate, in samples per second.
        category: Name used for both the sub-directory and the file prefix.
        index: Number given to the first clip written by this call.
        duration: Clip length in seconds.
        output_root: Directory that contains the per-category sub-directories.

    Returns:
        The next unused clip number (``index`` plus the number of clips written),
        so consecutive calls can continue the numbering.
    """
    # int() keeps range() usable when duration or sr is a float.
    samples_per_track = int(duration * sr)
    for start in range(0, len(audio_file), samples_per_track):
        clip = audio_file[start:start + samples_per_track]
        if len(clip) != samples_per_track:
            # Only the last slice can come up short; drop it.
            continue
        sf.write(f"{output_root}/{category}/{category}_{index}.wav", clip, sr)
        index += 1
    return index
    

for key in raagas.keys():
    # exist_ok lets this cell be re-run without failing on directories that an
    # earlier run already created.
    os.makedirs(f"raaga/{key}", exist_ok=True)
    i = 0  # running clip number, continued across all recordings of this raaga
    for file in raagas[key]:
        librosa_audio, librosa_sample_rate = librosa.load(wav_filedir+file)
        i = splitter(librosa_audio, librosa_sample_rate, key, i)

# Preprocessing of the training data

In [53]:
def features_extractor(audio_path):
    """Summarise an audio file as its time-averaged MFCC vector.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        Array with one value per MFCC coefficient (40 are requested), each
        being that coefficient's mean over the second axis of the MFCC matrix.
    """
    signal, rate = librosa.load(audio_path, res_type='scipy')
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
    # After the transpose the coefficients lie along axis 1, so averaging over
    # axis 0 leaves one number per coefficient.
    return np.mean(coefficients.T, axis=0)

In [54]:
data = []
# Keep only the per-raaga directories: a stray file inside "raaga" (for
# example .DS_Store) would otherwise make the inner os.listdir() fail.
raaga_dirs = [
    os.path.join("raaga", entry)
    for entry in os.listdir("raaga")
    if os.path.isdir(os.path.join("raaga", entry))
]

for filepath in raaga_dirs:
    for wav_files in os.listdir(filepath):
        if wav_files.endswith(".wav"):
            audio_path = os.path.join(filepath, wav_files)
            mfccs = features_extractor(audio_path)
            if mfccs is not None:
                # Label = file name up to the first dot, e.g. "darbari_4".
                class_label = wav_files.split('.')[0]
                data.append([mfccs, class_label])

In [70]:
# One row per clip: the first column holds the value returned by
# features_extractor, the second the label taken from the clip's file name.
df = pd.DataFrame(data, columns=['the extracted feature', 'class'])

In [71]:
# Show the frame; the labels still carry the per-clip number (e.g. "darbari_4").
df

Unnamed: 0,the extracted feature,class
0,"[-294.72336, 66.38028, -4.563169, 28.187496, -...",darbari_4
1,"[-296.6297, 65.154724, -7.1088257, 27.06044, -...",darbari_0
2,"[-290.2766, 67.15734, -6.7709665, 25.243633, -...",darbari_1
3,"[-292.5677, 64.556, -4.9036336, 28.469217, -8....",darbari_3
4,"[-299.14185, 66.71693, -2.1876426, 29.160795, ...",darbari_2
...,...,...
582,"[-294.55405, 64.03945, -6.8834014, 36.657455, ...",asavari_59
583,"[-322.85086, 69.57714, -2.2130072, 38.44013, -...",asavari_73
584,"[-332.06735, 68.83567, -3.1626728, 32.966625, ...",asavari_67
585,"[-319.1917, 74.266655, -3.88584, 29.038456, -4...",asavari_66


In [72]:
# Function to truncate numeric characters from class labels
def truncate_numbers(class_label):
    """Return the run of lowercase ASCII letters at the start of ``class_label``.

    For example ``'darbari_4'`` becomes ``'darbari'``.

    Raises:
        IndexError: If the label does not start with a lowercase ASCII letter.
    """
    leading_letters = re.findall(r"^[a-z]+", class_label)
    return leading_letters[0]

# Collapse per-clip labels such as "darbari_4" into the bare raaga name.
df['class'] = df['class'].map(truncate_numbers)

In [73]:
# Show the frame again to check the contents of the 'class' column.
df

Unnamed: 0,the extracted feature,class
0,"[-294.72336, 66.38028, -4.563169, 28.187496, -...",darbari
1,"[-296.6297, 65.154724, -7.1088257, 27.06044, -...",darbari
2,"[-290.2766, 67.15734, -6.7709665, 25.243633, -...",darbari
3,"[-292.5677, 64.556, -4.9036336, 28.469217, -8....",darbari
4,"[-299.14185, 66.71693, -2.1876426, 29.160795, ...",darbari
...,...,...
582,"[-294.55405, 64.03945, -6.8834014, 36.657455, ...",asavari
583,"[-322.85086, 69.57714, -2.2130072, 38.44013, -...",asavari
584,"[-332.06735, 68.83567, -3.1626728, 32.966625, ...",asavari
585,"[-319.1917, 74.266655, -3.88584, 29.038456, -4...",asavari


In [93]:
# NOTE(review): this assigns the column a list of its own values, which looks
# like a no-op — confirm whether a real conversion was intended here.
df['the extracted feature'] = df['the extracted feature'].to_list()

In [94]:
# Build one array from the per-row vectors and show its shape
# (rows x values per vector).
np.array(df['the extracted feature'].to_list()).shape

(587, 40)

In [95]:
# Shape of the column itself: one entry per row, each entry being a whole vector.
df['the extracted feature'].shape

(587,)

In [96]:
# Write the frame to data.csv without the index column.
# NOTE(review): each feature vector is presumably written as its string
# representation, which a reader of data.csv has to parse back — confirm that
# the consumer of this file expects that format.
df.to_csv("data.csv", index=False)