In [1]:
import numpy as np
import pandas as pd
from scipy.io import wavfile
from python_speech_features import mfcc
from sklearn.mixture import GaussianMixture

In [2]:
# Read the training audio files and extract their MFCC features

In [3]:
# Speakers in label order: a speaker's position in this list is its class id.
persons = ["Abhay", "Eknath", "Rg", "Rishika", "Vaibhav", "ShivamY"]
# Person -> class id mapping, derived from `persons` so the two can never drift
# apart (predict_speaker turns a predicted label back into a name via persons[label]).
class_ = {person: label for label, person in enumerate(persons)}

#function to populate the dataframe
def add_train_samples():
    """Build the training feature table from the enrolment recordings.

    For every speaker in ``class_`` the five files
    ``Voice_Samples_Training/<person>-00<class id + 1>/<person>_<1..5>.wav``
    are read and converted to MFCC frames (computed with ``nfft=1103``).

    Returns:
        pandas.DataFrame: one row per MFCC frame, with columns
        ``Feature0 .. Feature<k-1>`` (k = number of columns returned by
        ``mfcc``) plus an integer ``Class`` column holding the speaker's
        class id. Rows are ordered by speaker, then by recording, and the
        index is a fresh 0..n-1 range.
    """
    frames = []
    for person, class_value in class_.items():
        for i in range(5):
            filepath = f"Voice_Samples_Training/{person}-00{class_value + 1}/{person}_{i + 1}.wav"
            sample_rate, signal = wavfile.read(filepath)
            coefficients = mfcc(signal, sample_rate, nfft=1103)
            column_names = [f"Feature{i}" for i in range(coefficients.shape[1])]
            recording_df = pd.DataFrame(coefficients, columns=column_names)
            recording_df['Class'] = int(class_value)
            frames.append(recording_df)

    # A single concat over every recording yields the same rows, order and
    # index as concatenating per speaker first and then across speakers.
    return pd.concat(frames, ignore_index=True)

# Training table: one row per MFCC frame, labelled with the speaker's class id.
df = add_train_samples() 

In [4]:
# Split the training table into one frame per speaker; position k of `dfs`
# holds the rows whose 'Class' equals k.
dfs = [df[df['Class'] == label] for label in range(6)]

# Per-speaker aliases in class-id order:
# Abhay, Eknath, Rg, Rishika, Vaibhav, ShivamY
df0, df1, df2, df3, df4, df5 = dfs

In [5]:
def get_optimal_gmm(dataf, n_components_range=range(2, 5), n_features=13,
                    random_state=5, criterion="score"):
    """Fit several Gaussian mixtures to one speaker's frames and keep the best.

    Args:
        dataf: DataFrame whose first ``n_features`` columns are the features
            to model (any further columns, e.g. ``Class``, are ignored).
        n_components_range: iterable of candidate component counts, tried in
            order. Defaults to 2, 3 and 4.
        n_features: number of leading columns used as features.
        random_state: seed passed to every ``GaussianMixture``.
        criterion: how candidates are compared.
            ``"score"`` (default, the original behaviour) keeps the model with
            the highest average log-likelihood on the data it was fitted to.
            NOTE(review): that quantity usually keeps rising as components are
            added, so it tends to pick the largest candidate.
            ``"bic"`` / ``"aic"`` keep the model with the lowest BIC / AIC on
            the same data, both of which penalise model size.

    Returns:
        The fitted ``GaussianMixture`` that wins under ``criterion``; ties
        keep the earlier candidate. ``None`` if ``n_components_range`` is
        empty.

    Raises:
        ValueError: if ``criterion`` is not one of the supported names.
    """
    if criterion not in ("score", "bic", "aic"):
        raise ValueError(
            f"criterion must be 'score', 'bic' or 'aic', got {criterion!r}"
        )

    # Slice once; every candidate is fitted and evaluated on the same features.
    features = dataf.iloc[:, :n_features]

    best_model = None
    best_value = None
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components, random_state=random_state)
        gmm.fit(features)

        if criterion == "score":
            value = gmm.score(features)
        elif criterion == "bic":
            value = -gmm.bic(features)  # negated so that larger is always better
        else:
            value = -gmm.aic(features)

        if best_value is None or value > best_value:
            best_value = value
            best_model = gmm
    return best_model

In [6]:
# Fit one GMM per speaker. `dfs` is ordered by class id, so GMM_models[k] is
# the model for the speaker with class id k. Iterating `dfs` directly avoids a
# separate hard-coded speaker count.
GMM_models = [get_optimal_gmm(speaker_df) for speaker_df in dfs]

In [7]:
# Display the model selected for each speaker (shows the chosen n_components).
GMM_models

[GaussianMixture(n_components=4, random_state=5),
 GaussianMixture(n_components=4, random_state=5),
 GaussianMixture(n_components=4, random_state=5),
 GaussianMixture(n_components=4, random_state=5),
 GaussianMixture(n_components=4, random_state=5),
 GaussianMixture(n_components=4, random_state=5)]

In [8]:
# Parallel result accumulators: entry k of each list describes the k-th
# evaluated test recording.
# File names of the evaluated recordings (appended by test_model).
audio_files=[]
# True class id of each recording (appended by test_model).
true_label = []
# Predicted speaker name of each recording (appended by predict_speaker).
predicted_speakers=[]
# Predicted class id of each recording (appended by predict_speaker).
predicted_labels=[]

def predict_speaker(audio_file):
    """Identify the speaker of one recording with the per-speaker GMMs.

    The recording is converted to MFCC frames (``nfft=1103``) and scored
    against every model in ``GMM_models``; the model with the highest
    ``score`` wins (the first one on ties).

    Side effects:
        Appends the winning speaker's name to ``predicted_speakers`` and the
        winning class id to ``predicted_labels``.

    Args:
        audio_file: path of the WAV file to classify.

    Returns:
        int: the predicted class id, i.e. the index of the winning model in
        ``GMM_models``. (The original returned nothing; callers that ignore
        the return value are unaffected.)

    Raises:
        ValueError: if ``GMM_models`` is empty.
    """
    frequency_sampling, audio_signal = wavfile.read(audio_file)
    features_mfcc = mfcc(audio_signal, frequency_sampling, nfft=1103)
    person_df = pd.DataFrame(
        features_mfcc,
        columns=[f"Feature{i}" for i in range(features_mfcc.shape[1])],
    )

    if not GMM_models:
        raise ValueError("GMM_models is empty; train the speaker models first")

    scores = [gmm.score(person_df) for gmm in GMM_models]
    # max() keeps the first index among equal scores, matching a strict '>' scan.
    predicted_speaker = max(range(len(scores)), key=scores.__getitem__)

    predicted_speakers.append(persons[predicted_speaker])
    predicted_labels.append(predicted_speaker)
    return predicted_speaker


In [9]:
def test_model():
    """Run the speaker models over every test recording and record the results.

    For each speaker, in the order Abhay, Eknath, Rg, Rishika, ShivamY,
    Vaibhav, the following files under ``Testing_Audio/`` are evaluated:

    * ``<person>_audio1.wav .. <person>_audio5.wav``;
    * for speakers that have an extra folder, five consecutively numbered
      files from it, interleaved with the files above (audio1, extra1,
      audio2, extra2, ...);
    * one final ``P<n>.wav`` file.

    Side effects:
        Empties ``audio_files``, ``true_label``, ``predicted_speakers`` and
        ``predicted_labels`` and then refills them, one entry per recording.
        Clearing first keeps the four lists aligned and prevents duplicated
        rows when the function is run more than once in the same kernel.
    """

    def _evaluate(file_name, filepath, label):
        # Record the expected outcome, then classify (predict_speaker appends
        # the prediction to the global result lists).
        audio_files.append(file_name)
        true_label.append(label)
        predict_speaker(filepath)

    # Clear in place so existing references to these lists stay valid.
    for results in (audio_files, true_label, predicted_speakers, predicted_labels):
        results.clear()

    # (person, extra folder spec or None, final single file)
    # extra folder spec = (sub-directory, file-name prefix, first file number)
    test_sets = (
        ("Abhay", ("Abhay-P1", "Abhay", 15), "P1.wav"),
        ("Eknath", None, "P2.wav"),
        ("Rg", ("Rg-P3", "Rg", 16), "P3.wav"),
        ("Rishika", None, "P4.wav"),
        ("ShivamY", ("ShivamY-P5", "chappu", 6), "P5.wav"),
        ("Vaibhav", ("Vaibhav-P6", "Vaibhav", 16), "P6.wav"),
    )

    for person, extra, single_file in test_sets:
        label = class_[person]
        for i in range(5):
            file_name = f"{person}_audio{i + 1}.wav"
            _evaluate(file_name, f"Testing_Audio/{file_name}", label)

            if extra is not None:
                subdir, prefix, first_number = extra
                file_name = f"{prefix}_{first_number + i}.wav"
                _evaluate(file_name, f"Testing_Audio/{subdir}/{file_name}", label)

        _evaluate(single_file, f"Testing_Audio/{single_file}", label)

In [10]:
# Evaluate every test recording; fills audio_files, true_label,
# predicted_speakers and predicted_labels.
test_model()

In [11]:
# Print the evaluation results as a fixed-width text table
# (every cell is left-aligned in a 20-character column).
header = "| " + " | ".join(
    f"{title:<20}"
    for title in ('audio_files', 'true_label', 'predicted_speakers', 'predicted_labels')
) + " |"
line = "-" * len(header)
print(line, header, line, sep="\n")

for i in range(len(audio_files)):
    # One row per evaluated recording, columns in the same order as the header.
    row = "| " + " | ".join(
        f"{column[i]:<20}"
        for column in (audio_files, true_label, predicted_speakers, predicted_labels)
    ) + " |"
    print(row)
print(line)


---------------------------------------------------------------------------------------------
| audio_files          | true_label           | predicted_speakers   | predicted_labels     |
---------------------------------------------------------------------------------------------
| Abhay_audio1.wav     | 0                    | Abhay                | 0                    |
| Abhay_15.wav         | 0                    | Abhay                | 0                    |
| Abhay_audio2.wav     | 0                    | Abhay                | 0                    |
| Abhay_16.wav         | 0                    | Abhay                | 0                    |
| Abhay_audio3.wav     | 0                    | Abhay                | 0                    |
| Abhay_17.wav         | 0                    | Abhay                | 0                    |
| Abhay_audio4.wav     | 0                    | Abhay                | 0                    |
| Abhay_18.wav         | 0                    | Abhay       

In [12]:
# Accuracy = share of test recordings whose predicted label equals the true label.
# Validate the inputs first so a skipped or failed evaluation gives a clear
# message rather than a ZeroDivisionError / IndexError.
if len(true_label) != len(predicted_labels):
    raise ValueError(
        f"true_label has {len(true_label)} entries but predicted_labels has "
        f"{len(predicted_labels)}; re-run the evaluation"
    )
if not true_label:
    raise ValueError("No test results to score; run test_model() first")

correct_classifications = sum(
    1 for actual, predicted in zip(true_label, predicted_labels) if actual == predicted
)
# Count and denominator now come from the same list.
accuracy = correct_classifications / len(true_label)
print(f"Accuracy of the model is {accuracy*100}")

Accuracy of the model is 96.42857142857143
