In [120]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import soundfile as sf
from scipy.fft import rfft, rfftfreq
import librosa
import resampy
import librosa.display
import IPython.display as ipd
import os
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from pydub import AudioSegment
import os
warnings.filterwarnings('ignore')


In [59]:
def mfcc_feature_extractor(audio, sampleRate):
    mfccsFeatures = librosa.feature.mfcc(y=audio, sr=sampleRate, n_mfcc=40)
    mfccsScaledFeatures = np.mean(mfccsFeatures.T, axis=0)
    return mfccsScaledFeatures

In [60]:
def contrast_feature_extractor(audio, sampleRate):
    stft = np.abs(librosa.stft(audio))
    contrast = librosa.feature.spectral_contrast(S=stft, sr=sampleRate)
    contrastScaled = np.mean(contrast.T, axis=0)
    return contrastScaled

In [61]:
def tonnetz_feature_extractor(audio, sampleRate):
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sampleRate)
    tonnetzScaled = np.mean(tonnetz.T, axis=0)
    return tonnetzScaled

In [62]:
def chroma_feature_extractor(audio,sampleRate):
    stft = np.abs(librosa.stft(audio))
    chroma = librosa.feature.chroma_stft(S=stft,sr=sampleRate)
    chromaScaled = np.mean(chroma.T,axis=0)
    print("croma  ", chromaScaled)
    return chromaScaled

In [63]:
def centroid_feature_extractor(audio, sampleRate):
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sampleRate)
    centroidScaled = np.mean(centroid.T, axis=0)
    print("cntroid  ", centroidScaled)
    return centroidScaled

In [64]:
def pitch_feature_extractor(audio, sample_rate):
    pitches, magnitudes = librosa.core.piptrack(y=audio, sr=sample_rate)
    pitch_mean = np.mean(pitches[pitches > 0])
    print("ptch mean ",pitch_mean)
    pitch_range = np.max(pitches) - np.min(pitches)
    return pitch_mean

In [65]:
def energy_feature_extractor(audio, sample_rate):
    rmse = librosa.feature.rms(y=audio)
    energy_mean = np.mean(rmse)
    print("energy mean ",energy_mean)
    return energy_mean

In [66]:
def rhythm_feature_extractor(audio, sample_rate):
    onset_env = librosa.onset.onset_strength(y=audio, sr=sample_rate)
    tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sample_rate)
    print("tempo", tempo)
    tempo = np.mean(tempo)
    return tempo

In [67]:
def features_extractor(file):
    if file.lower().endswith(".m4a"):
        audio = AudioSegment.from_file(file, format="m4a")
        file_name, _ = os.path.splitext(os.path.basename(file))
        output_path = os.path.join(os.path.dirname(file), file_name + ".wav")
        audio.export(output_path, format="wav")
        # os.remove(file)
        file = output_path

    features = []
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast') 
    mfcc = mfcc_feature_extractor(audio, sample_rate)
    contrast = contrast_feature_extractor(audio, sample_rate)
    tonnetz = tonnetz_feature_extractor(audio, sample_rate)
    chroma = chroma_feature_extractor(audio, sample_rate)

    # Additional Features
    centroid = centroid_feature_extractor(audio, sample_rate)
    pitch, pitch_range = pitch_feature_extractor(audio, sample_rate)
    energy = energy_feature_extractor(audio, sample_rate)
    rhythm = rhythm_feature_extractor(audio, sample_rate)

    # Concatenate all features
    concatenated_features = np.concatenate((mfcc, contrast, tonnetz, chroma, centroid, pitch, pitch_range, energy, rhythm), axis=0)
    features.append(concatenated_features)

    return features

In [68]:
def add_features(extractedFeatures, dirPath, label):
    for file in os.listdir(dirPath):
        filePath = os.path.join(dirPath, file)

        if filePath.lower().endswith(".m4a"):
            audio = AudioSegment.from_file(filePath, format="m4a")
            file_name, _ = os.path.splitext(os.path.basename(filePath))
            output_path = os.path.join(os.path.dirname(filePath), file_name + ".wav")
            audio.export(output_path, format="wav")
            # os.remove(filePath)
            filePath = output_path

        audio, sampleRate = librosa.load(filePath, res_type='kaiser_fast') 
        mfcc = mfcc_feature_extractor(audio, sampleRate)
        contrast = contrast_feature_extractor(audio, sampleRate)
        tonnetz = tonnetz_feature_extractor(audio, sampleRate)
        chroma = chroma_feature_extractor(audio, sampleRate)
        centroid = centroid_feature_extractor(audio, sampleRate)
        pitch = pitch_feature_extractor(audio, sampleRate)
        energy = energy_feature_extractor(audio, sampleRate)
        rhythm = rhythm_feature_extractor(audio, sampleRate)

        extractedFeatures.append([mfcc, contrast, tonnetz, chroma, centroid, pitch, energy, rhythm, label])

In [69]:
dict1 = {
"open":[
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/ahmeda",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/ahmedk",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/hassan",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/hazem",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/ibrahim",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/mohannad",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/omar",
],

"unlock":[
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/ahmeda",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/ahmedk",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/hassan",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/hazem",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/ibrahim",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/mohannad",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/omar",
],

"grant":[
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/grant/ahmeda",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/grant/ahmedk",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/grant/hassan",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/grant/hazem",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/grant/ibrahim",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/grant/mohannad",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/grant/omar",

],
"other":[
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/other sentences/ahmeda",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/other sentences/ahmedk",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/other sentences/hassan",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/other sentences/hazem",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/other sentences/ibrahim",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/other sentences/mohannad",
"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/other sentences/omar",
]
}


In [70]:
extractedFeatures = []
for i in dict1.keys():
    for x in dict1[i]:
        add_features(extractedFeatures, x, i)

croma   [0.5946693  0.6137347  0.62787664 0.614527   0.5042486  0.4151953
 0.43360034 0.44911465 0.53862524 0.614247   0.6349716  0.6209179 ]
cntroid   [1216.32381508]
ptch mean  928.4121
energy mean  0.01918658
tempo 151.99908088235293
croma   [0.6336483  0.6737882  0.6588121  0.6245098  0.5495568  0.49105504
 0.44927827 0.4989933  0.61403096 0.6267509  0.6365642  0.65418994]
cntroid   [1091.03490907]
ptch mean  842.0918
energy mean  0.01462504
tempo 151.99908088235293
croma   [0.5846696  0.6051889  0.6057149  0.5705867  0.464287   0.4517521
 0.50352234 0.5339707  0.599329   0.5957749  0.55800164 0.55815345]
cntroid   [1112.71701885]
ptch mean  814.36096
energy mean  0.015016647
tempo 151.99908088235293
croma   [0.63430023 0.62471426 0.6044777  0.5922242  0.5274836  0.4406907
 0.4293781  0.47947618 0.5896408  0.66050196 0.63385373 0.6312893 ]
cntroid   [1059.30419292]
ptch mean  889.41846
energy mean  0.013971805
tempo 151.99908088235293
croma   [0.58916926 0.61924756 0.6116577  0.584

In [71]:
data = pd.DataFrame(extractedFeatures, columns=['mfcc','contrast','tonnetz','chroma', 'centroid','pitch','energy','rhythm', 'class'])
print(data.shape)
data.to_csv('password.csv', index=False)
data.describe()

(553, 9)


Unnamed: 0,pitch,energy,rhythm
count,550.0,553.0,553.0
mean,1108.753174,0.020404,126.004919
std,333.271698,0.016856,28.321024
min,504.969635,0.0,0.0
25%,862.043457,0.010484,107.666016
50%,1030.441162,0.015939,123.046875
75%,1280.629639,0.022535,143.554688
max,2071.038086,0.09757,234.90767


In [72]:
print(data.columns)

Index(['mfcc', 'contrast', 'tonnetz', 'chroma', 'centroid', 'pitch', 'energy',
       'rhythm', 'class'],
      dtype='object')


In [109]:
features = data.iloc[:, 0:-1]
target = data['class']
target.unique()

array(['open', 'unlock', 'grant', 'other'], dtype=object)

In [110]:
features = features.values.tolist()
for i in range(len(features)):
    features[i] = np.concatenate((
        np.atleast_1d(features[i][0]),
        np.atleast_1d(features[i][1]),
        np.atleast_1d(features[i][2]),
        np.atleast_1d(features[i][3]),
        np.atleast_1d(features[i][4]),
        np.atleast_1d(np.squeeze(features[i][5])),  
        np.atleast_1d(np.squeeze(features[i][6])),
        np.atleast_1d(features[i][7])
    ))

In [111]:
encoder = LabelEncoder()
target = encoder.fit_transform(target)

In [123]:
class_counts = np.bincount(target)

for class_label, count in enumerate(class_counts):
    print(f"Class {class_label}: {count} instances")

Class 0: 178 instances
Class 1: 141 instances
Class 2: 83 instances
Class 3: 151 instances


In [124]:
xTrain, xTest, yTrain, yTest = train_test_split(features, target, test_size=0.25, random_state=105)
imputer = SimpleImputer(strategy='mean')
xTrain_imputed = imputer.fit_transform(xTrain)
print(xTrain)

[array([-5.88122620e+02,  6.44887772e+01, -3.94774437e-01,  5.64003792e+01,
        5.59549093e-01,  1.74333553e+01, -9.07747507e-01,  4.89814758e-01,
       -7.82584143e+00,  5.46245766e+00, -1.09336081e+01,  4.80909252e+00,
       -3.38731503e+00, -1.09901352e+01,  6.03764009e+00, -6.03220940e-01,
       -1.35487928e+01, -4.39634943e+00,  5.55508041e+00, -5.45292282e+00,
        1.04062200e+00, -2.01267838e+00, -4.57291937e+00, -2.71554828e+00,
       -3.47059917e+00, -3.67188382e+00,  2.08856797e+00, -6.54011154e+00,
       -3.73318791e+00, -7.10195303e+00, -9.56674194e+00,  1.16183615e+00,
       -7.36134338e+00, -4.60072374e+00,  4.82325935e+00,  1.82811701e+00,
        1.30729759e+00, -3.67613465e-01,  1.50111899e-01,  1.66352904e+00,
        2.44662035e+01,  1.58271867e+01,  1.78592097e+01,  1.53144759e+01,
        2.00424384e+01,  2.32752706e+01,  3.85466384e+01,  1.72455319e-02,
       -8.51617110e-02, -3.19341663e-02, -1.41157194e-02,  1.21182416e-02,
       -1.66840604e-02, 

### creating RandomForest model

In [125]:
classifier = RandomForestClassifier(n_estimators=100, criterion="entropy")
classifier.fit(xTrain,yTrain)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### creating SVM model

In [89]:
# classifier = SVC(kernel='linear',decision_function_shape="ovo" ,probability=True) 
# classifier.fit(xTrain,yTrain)

### creating XGBoost model

In [90]:
# classifier = XGBClassifier(objective='multi:softmax', num_class=4, eval_metric='mlogloss')
# classifier.fit(xTrain, yTrain)

In [91]:
preds = classifier.predict(xTest)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [92]:
print(classification_report(yTest, preds))

NameError: name 'preds' is not defined

In [93]:
def prediction(pred):
    if pred == 0:
        print("grant")
    elif pred == 1:
        print("open")
    elif pred == 2:
        print("other")
    elif pred == 3:
        print("unlock")

In [94]:
testFeatures = features_extractor(f"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/dataset/recorded/2.m4a") 
pred = classifier.predict(testFeatures)
prediction(pred)
print(classifier.predict_proba(testFeatures))

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/dataset/recorded/2.m4a'

In [None]:
# testFeatures = features_extractor("C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/omar/omar_open_middle_door_1.wav") # open middle door
# pred = classifier.predict(testFeatures)
# prediction(pred)
# print(classifier.predict_proba(testFeatures))

In [None]:
testFeatures = features_extractor("C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/unlock/mohannad/Recording (57).wav") # unlock the gate
pred = classifier.predict(testFeatures)
prediction(pred)
print(classifier.predict_proba(testFeatures))

unlock
[[0.15 0.06 0.01 0.78]]


In [None]:
for file in os.listdir("C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/omar")[0:10]:
  testFeatures = features_extractor(f"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/omar/{file}") 
  pred = classifier.predict(testFeatures)
  prediction(pred) 
  print(classifier.predict_proba(testFeatures))

open
[[0.04 0.86 0.02 0.08]]
open
[[0.04 0.86 0.02 0.08]]
open
[[0.01 0.94 0.03 0.02]]
open
[[0.01 0.94 0.03 0.02]]
open
[[0.   0.92 0.05 0.03]]
open
[[0.   0.92 0.05 0.03]]
open
[[0.   0.95 0.04 0.01]]
open
[[0.   0.95 0.04 0.01]]
open
[[0.01 0.93 0.04 0.02]]
open
[[0.01 0.93 0.04 0.02]]


In [None]:
for file in os.listdir("C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/hazem")[0:5]:
  testFeatures = features_extractor(f"C:/Users/omara/OneDrive/Desktop/Voice Signal Authentication/ML/new audios/open/hazem/{file}") 
  pred = classifier.predict(testFeatures)
  prediction(pred)
  print(classifier.predict_proba(testFeatures))

open
[[0.04 0.71 0.11 0.14]]
open
[[0.   0.8  0.08 0.12]]
open
[[0.   0.75 0.12 0.13]]
open
[[0.01 0.7  0.14 0.15]]
open
[[0.06 0.47 0.32 0.15]]


In [None]:
pickle_out = open("password.pkl", "wb")
pickle.dump(classifier, pickle_out)
pickle_out.close()