In [66]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


In [3]:
normal_y, normal_sr = librosa.load('data/fan/id_00/normal/00000000.wav')

In [4]:
abnormal_y, abnormal_sr = librosa.load('data/fan/id_00/abnormal/00000000.wav')

In [11]:
y = [*normal_y,*abnormal_y]
df = pd.DataFrame(y,columns=['amplitude'])
df['target'] = [0]*len(y)
df.loc[:(len(y)/2)-1,'target']=0
df.loc[len(y)/2:len(y)-1,'target']=1
df

Unnamed: 0,amplitude,target
0,-0.014585,0
1,-0.017370,0
2,-0.015581,0
3,-0.016613,0
4,-0.016445,0
...,...,...
440995,0.014003,1
440996,0.011594,1
440997,0.011231,1
440998,0.013799,1


In [12]:
df['target'].value_counts()


0    220500
1    220500
Name: target, dtype: int64

In [47]:
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:1].values,df.iloc[:,1],test_size=0.2)

In [48]:
pipeline_lr = Pipeline([('scalar1',StandardScaler()),
                        ('lr_classifier',LogisticRegression(random_state=0))])

In [49]:
pipeline_rf = Pipeline([('scalar2',StandardScaler()),
                        ('rf_classifier',RandomForestClassifier(n_estimators=int((len(df.columns)**0.5//1)), criterion= 'entropy',random_state=0))])

In [50]:
pipeline_dt = Pipeline([('scalar3',StandardScaler()),
                        ('dt_classifier',DecisionTreeClassifier(criterion="entropy",random_state=0))])

In [51]:
pipeline_GaussianNB = Pipeline([('scalar4',StandardScaler()),
                        ('dt_classifier',GaussianNB())])

In [52]:
pipelines = [pipeline_lr,pipeline_rf,pipeline_dt,pipeline_GaussianNB]

In [53]:
best_accuracy=0.0
best_classifier_indx = 0
best_pipeline=""

In [54]:
pipe_dict = {0:'Logistic Regression',1:'Random Forest',2:'Decision Tree',3:'GaussianNB'}

In [55]:
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [56]:
for i,model in enumerate(pipelines):
    print(f"{pipe_dict[i]} test accuracy: {model.score(X_test,y_test)} ")
    if model.score(X_test,y_test)>best_accuracy:
        best_accuracy=  model.score(X_test,y_test)
        best_pipeline=model
        best_classifier_idnx = i
print(f"Classifier with the best accuracy:{pipe_dict[best_classifier_idnx]}")

Logistic Regression test accuracy: 0.5177777777777778 
Random Forest test accuracy: 0.5062244897959184 
Decision Tree test accuracy: 0.5043877551020408 
GaussianNB test accuracy: 0.5409297052154195 
Classifier with the best accuracy:GaussianNB


In [57]:
df

Unnamed: 0,amplitude,target
0,-0.014585,0
1,-0.017370,0
2,-0.015581,0
3,-0.016613,0
4,-0.016445,0
...,...,...
440995,0.014003,1
440996,0.011594,1
440997,0.011231,1
440998,0.013799,1


In [69]:
def features_extractor(file):
    audio,sample_rate = librosa.load(file,res_tupe = kaiser_fast)
    mfccs_features = librosa.feature.mfcc(y=audio,sr=sample_rate,n_mfcc=40)
    mfcc_scaled_features = np.mean(mfccs.T,axis=0)
    
    return mfccs_scaled_features

In [68]:
import numpy as np
from tqdm import tqdm

extracted_featurs = []
for index_num,row in tqdm(metadata.iterrows()):
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str() )
    final_class_labels = row["class"]
    daya = features_extractor(file_name)
    extracted_features.append([data,final_class_labels])

In [None]:
extracted_features_df = pd.DataFrame(extracted_features,columns=['feature','class'])