In [None]:
# Install Packages
!pip install kaggle --upgrade --quiet
!pip install opendatasets --upgrade --quiet
!pip install librosa --upgrade --quiet
!pip install noisereduce --upgrade --quiet

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
import opendatasets as od
import csv

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import seaborn as sns
import noisereduce as nr
from IPython.display import Audio
import librosa
import librosa.display
import wave

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import prjLib as lib
import VictorLib as vic

import warnings
warnings.filterwarnings ('ignore')

In [None]:
# Parameters

# Frame / hop lengths. The alternative pair noted previously was 4096 / 2048.
FRAME_LEN, HOP_LEN = 2048, 512
# Feature-count setting — presumably what is forwarded as `N` to the feature
# extractor; confirm against its definition.
N = 128

# Figure dimensions (x, y) — presumably consumed by the plotting helpers; verify.
FIGX, FIGY = 14, 2

In [None]:
# Grid search over the number of MFCCs: evaluate every candidate, then rank.
# NOTE(review): DataPreprocessing and SortDataDescent are not defined in the
# visible part of the notebook — make sure they exist before this cell runs.
l_n_mfccs = np.arange(5,128)
# One row per candidate: column 0 holds the score, column 1 the n_mfcc value.
lScore = np.zeros((len(l_n_mfccs), 2))
for ii, candidate in enumerate(tqdm(l_n_mfccs, desc='Grid Searching model')):
    lScore[ii,0] = DataPreprocessing(candidate)
    lScore[ii,1] = candidate
# Presumably sorts the rows by column 0 in descending order, so row 0 is the best.
best_results = SortDataDescent(lScore,0)
best_score, best_n_mfcc = best_results[0,0], best_results[0,1]
print(f'GridSearched: Best result for n_mfcc = {best_n_mfcc} with test score = {best_score}.')

In [None]:
def _list_wav_files(root_dir):
    """Return (directory, filename) pairs for each '.wav' file under root_dir, in os.walk order."""
    return [
        (subdir, file)
        for subdir, _dirs, files in os.walk(root_dir)
        for file in files
        # Suffix match: a substring test would also accept e.g. 'x.wav.asd'.
        if file.endswith('.wav')
    ]


def _oversample(train_data, train_labels, extra_copies):
    """Append extra_copies[label] duplicates of all samples carrying each listed label."""
    data_parts = [train_data]
    label_parts = [train_labels]
    for label, copies in extra_copies.items():
        mask = (train_labels == label)
        data_parts.extend([train_data[mask]] * copies)
        label_parts.extend([train_labels[mask]] * copies)
    return np.concatenate(data_parts, axis=0), np.concatenate(label_parts, axis=0)


def FeaturesGridSearch(frame_len,hop_len,N, plot=False):
    """Extract features from every dataset in `data_paths` and score them.

    Each entry of the notebook-level `data_paths` is indexed as
    `entry[0]` (dataset name, used in the progress text and passed to
    `ExtractLabel`) and `entry[2]` (root directory that is walked for
    '.wav' files).

    Parameters
    ----------
    frame_len, hop_len, N :
        Forwarded unchanged to `ExtractFeatures(sample, sr, frame_len, hop_len, N)`.
    plot : bool, default False
        Forwarded to `lib.PlotSplitData` and `TestFeaturesInSimpleModel`.

    Returns
    -------
    The score returned by `TestFeaturesInSimpleModel`.

    Raises
    ------
    ValueError
        If no '.wav' file is found under any dataset root.
    """
    # Collect Data and Extract Features from audio files
    featured_data = []  # stores the features
    labels = []  # stores the labels
    for data_path in tqdm(data_paths,desc="Collecting Datasets"):
        # List the files first so the progress bar really counts files
        # (it is labelled unit="file") and the tree is walked only once.
        wav_files = _list_wav_files(data_path[2])
        for subdir, file in tqdm(wav_files, desc=f'Processing {data_path[0]} audio files', unit="file"):
            sample, sr = librosa.load(os.path.join(subdir, file))
            # fix_sample = FixSamples(sample,sr,4)  (disabled)
            featured_data.append(ExtractFeatures(sample,sr,frame_len,hop_len,N))
            labels.append(ExtractLabel(file,data_path[0]))
    if not featured_data:
        # Without this guard the shape[1] lookup below fails with an opaque IndexError.
        raise ValueError("No '.wav' files were found under any of the data_paths roots.")
    featured_data = np.asarray(featured_data)
    labels = np.array(labels)
    print(f'Data with total {featured_data.shape[0]} samples collected with {featured_data.shape[1]} features.')

    # Data samples split
    train_data,train_labels,test_data,test_labels = lib.PlotSplitData(featured_data,labels,trainRatio=0.8,plot=plot)

    # (Manual feature selection is currently disabled; all features are kept.)

    # Oversample the train data for a more uniform histogram of classes:
    # five extra copies of label 1 and one extra copy of label 7, appended in that order.
    train_data, train_labels = _oversample(train_data, train_labels, {1: 5, 7: 1})
    print(f'Train Data extrapulated with total {train_data.shape[0]} samples collected with {train_data.shape[1]} features.')

    # Test features with a simple model
    n_estimators = 100
    min_samples_split = 6
    random_state = 512
    score, _feature_importance = TestFeaturesInSimpleModel(train_data,train_labels,test_data,test_labels, n_estimators=n_estimators,min_samples_split=min_samples_split,random_state=random_state, plot=plot)
    return score


In [None]:
# Grid search over N with fixed frame / hop lengths.
frame_len = 4096
hop_len = 2048
N = 14
# F and Z are kept for any other cell that reads them, but they are not passed
# to FeaturesGridSearch: it accepts only (frame_len, hop_len, N, plot), so the
# former six-argument calls raised TypeError.
F = 10
Z = 0.1
lN = [14,20,40,128]
# Start below any possible score so the first candidate always becomes the
# incumbent; starting at 0 left bestN = 0 whenever no score exceeded 0.
bestScore = float('-inf')
bestN = None
# NOTE: the loop rebinds the notebook-level name N to each candidate in turn.
for N in tqdm(lN,desc="lN Grid Search"):
    score = FeaturesGridSearch(frame_len,hop_len,N,plot=False)
    if score > bestScore:
        bestScore = score
        bestN = N
print(f'Best score = {bestScore} of N = {bestN} .')
# Re-run the winning configuration with plots enabled.
FeaturesGridSearch(frame_len,hop_len,bestN,plot=True)