# DETAILED KNOWLEDGE OF THE
## FEATURE EXTRACTION AND DATASET CREATION

First, we collected the audio files from VoxCeleb together with a CSV metadata file (vox1_meta.csv).

This CSV file contains the gender of each speaker along with the corresponding audio ID.

The gender and the IDs are extracted from this CSV file and stored in another CSV file.

The dataset is created in such a way that most of the male and female voices are taken into account.

The features are extracted from the audio in the function get_features.

Then the dataset is created with 16 columns.

The last column, 'LABEL', is added to the dataset with the help of the dataset created from vox1_meta.csv.

Finally, the dataset is saved as dataset.csv.

While checking the predicted gender of the person speaking in an audio clip, we noticed that clips in which

a female speaker has a deep voice result in a wrong prediction.

So we added some more audio clips, which we collected from some of our

friends, to the existing dataset.

Click on the link below to see how we added more audio clips to the existing dataset.

<a href="http://localhost:8888/notebooks/Desktop/Final%20Year%20Project/DatasetCreation2.ipynb">CLICK HERE</a>

In [1]:
#IMPORT THE REQUIRED PACKAGES
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy.io import wavfile 
from scipy.stats import stats
import multiprocessing
import concurrent.futures
import time
import librosa

In [2]:
#Read the csv file obtained from VoxCeleb to separate the genders
#NOTE: the file is tab-separated, so with the default comma separator every
#row is loaded as ONE string column (see the head() output below); the rows
#are split on the tab character in a later cell.
read_file=pd.read_csv(r'C:\\Users\\SAGAR\\Desktop\\Final Year Project\\csv_dataset\\vox1_meta.csv')
read_file.head()

Unnamed: 0,VoxCeleb1 ID	VGGFace1 ID	Gender	Nationality	Set
0,id10001\tA.J._Buckley\tm\tIreland\tdev
1,id10002\tA.R._Rahman\tm\tIndia\tdev
2,id10003\tAamir_Khan\tm\tIndia\tdev
3,id10004\tAaron_Tveit\tm\tUSA\tdev
4,id10005\tAaron_Yoo\tm\tUSA\tdev


In [3]:
line=read_file.to_numpy()#convert the DataFrame to a numpy array: one row per speaker, each holding a single tab-joined string
line

array([['id10001\tA.J._Buckley\tm\tIreland\tdev'],
       ['id10002\tA.R._Rahman\tm\tIndia\tdev'],
       ['id10003\tAamir_Khan\tm\tIndia\tdev'],
       ...,
       ['id11249\tZack_Snyder\tm\tUSA\tdev'],
       ['id11250\tZoe_Saldana\tf\tUSA\tdev'],
       ['id11251\tZulay_Henao\tf\tUSA\tdev']], dtype=object)

In [4]:
# Split every metadata row into its five tab-separated fields and keep only
# the speaker ID, the gender and the nationality for further use.
vgg_id = []
vgg_gender = []
vgg_nation = []
for row in line:
    # Each row is a one-element array holding
    # "id<TAB>name<TAB>gender<TAB>nationality<TAB>set".
    # The speaker name and the dev/test set are not needed later, so they
    # are unpacked into throw-away names.
    vg_id, _vg_name, vg_gender, vg_nation, _vg_set = row[0].split("\t")
    vgg_id.append(vg_id)
    vgg_gender.append(vg_gender)
    vgg_nation.append(vg_nation)

# DataFrame with one row per speaker: VGG_ID, LABEL (gender) and COUNTRY.
csv_dict = {"VGG_ID": vgg_id, "LABEL": vgg_gender, "COUNTRY": vgg_nation}
csv_df = pd.DataFrame(csv_dict)

In [5]:
# Split the speaker table by gender label:
#   'm' rows -> csv_label_1, 'f' rows -> csv_label_0.
# Rows carrying any other label are dropped. reset_index(drop=True) gives
# each part its own 0..n-1 index, exactly as if it had been built from
# fresh lists.
csv_label_1 = csv_df[csv_df["LABEL"] == "m"].reset_index(drop=True)
csv_label_0 = csv_df[csv_df["LABEL"] == "f"].reset_index(drop=True)

In [6]:
# Stack the male rows on top of the female rows with a fresh 0..n-1 index,
# then write the combined ID / gender / country table to disk (without the
# index column).
csv_label = pd.concat([csv_label_1, csv_label_0], ignore_index=True)
csv_label.to_csv(
    "C:\\Users\\SAGAR\\Desktop\\Final Year Project\\csv_dataset\\voice_dataset.csv",
    index=False,
)

In [7]:
def get_frequency(audio_file):
    """Extract dominant frequencies and spectral statistics from one audio file.

    The file is read twice: once with ``scipy.io.wavfile`` (raw samples, used
    for the windowed Fourier analysis) and once with ``librosa`` (used for the
    spectral descriptors).

    Parameters
    ----------
    audio_file : str
        Path to a WAV file.

    Returns
    -------
    tuple
        ``(freqArray, sp_centroids, sp_flatness, sp_rolloff, sp_bandwidth)``
        where ``freqArray`` is a list with the dominant frequency (in Hz) of
        every analysis window that falls in the accepted band, and the other
        four values are the means of the librosa spectral centroid, flatness,
        roll-off and bandwidth over the file.
    """
    rate, data = wavfile.read(audio_file)
    # Each analysis window covers one fifth of a second of samples.
    step = int(rate / 5)

    # The signal is passed by keyword: librosa >= 0.10 rejects it as a
    # positional argument, while older versions accept both forms.
    signal, sr = librosa.load(audio_file)
    sp_centroids = np.mean(librosa.feature.spectral_centroid(y=signal, sr=sr)[0])
    sp_flatness = np.mean(librosa.feature.spectral_flatness(y=signal))
    sp_rolloff = np.mean(librosa.feature.spectral_rolloff(y=signal, sr=sr))
    sp_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=sr))

    # Find the dominating frequency of every window using the Fourier
    # transform.
    # NOTE(review): assumes `data` is one-dimensional (mono) -- confirm for
    # the recordings that are used.
    dominant = []
    for start in range(0, len(data), step):
        fourier = np.fft.fft(data[start:start + step])
        # freqs holds the frequency (in cycles per sample) that belongs to
        # each Fourier coefficient; multiplying by the rate converts to Hz.
        freqs = np.fft.fftfreq(len(fourier))
        imax = np.argmax(np.abs(fourier))
        dominant.append(abs(freqs[imax] * rate))

    # Keep only 20-300 Hz and drop the 46-66 Hz band (presumably mains hum --
    # TODO confirm). The filter runs once, after the loop, so a file without
    # any samples yields an empty list instead of leaking the audio signal
    # into the result.
    freqArray = [f for f in dominant if 20 < f < 300 and not 46 < f < 66]
    return freqArray, sp_centroids, sp_flatness, sp_rolloff, sp_bandwidth

In [8]:
def get_features(path):
    """Build the feature table for every recording folder below ``path``.

    The directory is walked as ``path/<speaker id>/<video id>/<audio files>``.
    Every ``<video id>`` folder produces one row: the dominant frequencies of
    all its audio files are pooled and summarised, and the spectral
    descriptors of the files are averaged.

    Parameters
    ----------
    path : str
        Root directory of the dataset.

    Returns
    -------
    pandas.DataFrame
        One row per video folder with the 16 columns ``ID`` (the speaker
        folder name), ``meanfreq``, ``sd``, ``median``, ``Q25``, ``Q75``,
        ``iqr``, ``skew``, ``kurt``, ``mode``, ``low``, ``peak``,
        ``centroid``, ``flatness``, ``spectral rolloff`` and
        ``spectral bandwidth``. Frequency statistics are divided by 1000
        (Hz -> kHz). Folders without usable audio are reported and skipped.
    """
    columns = ['ID', 'meanfreq', 'sd', 'median', 'Q25', 'Q75', 'iqr', 'skew',
               'kurt', 'mode', 'low', 'peak', 'centroid', 'flatness',
               'spectral rolloff', 'spectral bandwidth']
    table = {name: [] for name in columns}

    for speaker in os.listdir(path):
        speaker_dir = os.path.join(path, speaker)
        for video in os.listdir(speaker_dir):
            video_dir = os.path.join(speaker_dir, video)
            print(video_dir)
            audio_paths = [os.path.join(video_dir, name)
                           for name in os.listdir(video_dir)]
            if not audio_paths:
                # Nothing to analyse; the statistics below are undefined for
                # an empty sample.
                print("skipping (no audio files): " + video_dir)
                continue

            # A thread pool (not separate processes) extracts the features of
            # the files of one folder concurrently to reduce the run time.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                results = list(executor.map(get_frequency, audio_paths))

            # Pool the dominant frequencies of all files of this folder.
            freqArray = [f for freqs, *_ in results for f in freqs]
            if not freqArray:
                # stats.describe raises on an empty sample.
                print("skipping (no frequencies in range): " + video_dir)
                continue

            # Mean spectral descriptors over all files of the folder.
            sp_cent = np.mean([r[1] for r in results])
            sfm = np.mean([r[2] for r in results])
            sp_rolloff = np.mean([r[3] for r in results])
            sp_bandwidth = np.mean([r[4] for r in results])

            # Summary statistics of the pooled dominant frequencies.
            _nobs, minmax, mean, _variance, skew, kurtosis = stats.describe(freqArray)
            low, peak = minmax  # minimum and maximum dominant frequency
            # Older SciPy returns a one-element array for .mode, newer
            # versions a scalar; atleast_1d handles both.
            mode = np.atleast_1d(stats.mode(freqArray).mode)[0]
            std = np.std(freqArray)
            q75, q25 = np.percentile(freqArray, [75, 25])  # third and first quartile
            iqr = q75 - q25  # interquartile range
            median = np.median(freqArray)

            # Append the values of this folder to the columns of the dataset.
            table['ID'].append(speaker)
            table['meanfreq'].append(mean / 1000)
            table['sd'].append(std / 1000)
            table['median'].append(median / 1000)
            table['Q25'].append(q25 / 1000)
            table['Q75'].append(q75 / 1000)
            table['iqr'].append(iqr / 1000)
            table['skew'].append(skew)
            table['kurt'].append(kurtosis)
            table['mode'].append(mode / 1000)
            table['low'].append(low / 1000)
            table['peak'].append(peak / 1000)
            table['centroid'].append(sp_cent)
            table['flatness'].append(sfm)
            table['spectral rolloff'].append(sp_rolloff)
            table['spectral bandwidth'].append(sp_bandwidth)

    return pd.DataFrame(data=table)

In [None]:
# Run the feature extraction over the whole training directory and report
# how many rows the resulting dataset has.
path = "D:\\training_dataset"
data = get_features(path)
files = data.to_numpy()
n = len(files)
print("   NUMBER OF AUDIO FILES IN DATASET:")
print("=======================================")
print(n)

D:\training_dataset\id10001\1zcIwhmdeo4
D:\training_dataset\id10001\7gWzIy6yIIk
D:\training_dataset\id10001\7w0IBEWc9Qw
D:\training_dataset\id10001\9mQ11vBs1wc
D:\training_dataset\id10001\DtdEYdViWdw
D:\training_dataset\id10001\eWIX7sfn-M0
D:\training_dataset\id10001\J9lHsKG98U8
D:\training_dataset\id10001\utrA-v8pPm4
D:\training_dataset\id10001\Y8hIVOBuels
D:\training_dataset\id10001\zELwAz2W6hM
D:\training_dataset\id10002\0_laIeN-Q44
D:\training_dataset\id10002\6WO410QOeuo
D:\training_dataset\id10002\C7k7C-PDvAA
D:\training_dataset\id10002\cMGEuZ1zqXk
D:\training_dataset\id10002\eNc4LrrvV80
D:\training_dataset\id10002\gaQqIoV_aLY
D:\training_dataset\id10002\Mpr9wqUuLQA
D:\training_dataset\id10002\QanuGhOhb9A
D:\training_dataset\id10002\QnnjJ9i5WFs
D:\training_dataset\id10002\RLKKsYiCMvc
D:\training_dataset\id10002\SI4D2_YXvBE
D:\training_dataset\id10002\TqUbiOgEb0w
D:\training_dataset\id10002\VMaXdHLz5Bk
D:\training_dataset\id10002\w9H-ZMvdE9M
D:\training_dataset\id10002\wd_7oYV4dsU


  **kwargs)
  ret = ret.dtype.type(ret / rcount)


D:\training_dataset\id10002\xTV-jFAUKcw
D:\training_dataset\id10002\Y2Gr1I2DO7M
D:\training_dataset\id10003\5ablueV_1tw
D:\training_dataset\id10003\A7Hh1WKmHsg
D:\training_dataset\id10003\A8SBCxYzJgs
D:\training_dataset\id10003\bDxy7bnj_bc
D:\training_dataset\id10003\BQxxhq4539A
D:\training_dataset\id10003\EGPV-Xa0LGk
D:\training_dataset\id10003\E_6MjfYr0sQ
D:\training_dataset\id10003\FfmnkloV_zg
D:\training_dataset\id10003\K5zRxtXc27s
D:\training_dataset\id10003\na8-QEFmj44
D:\training_dataset\id10003\NC70RWJDnMg
D:\training_dataset\id10003\tCq2LcKO6xY
D:\training_dataset\id10003\Tzn91xwBaWE
D:\training_dataset\id10003\yzIXg93UOIM
D:\training_dataset\id10003\_JpHD6VnJ3I
D:\training_dataset\id10006\0otHlFztX8I
D:\training_dataset\id10006\3MwyuwaVE50
D:\training_dataset\id10006\3RybHF5mX78
D:\training_dataset\id10006\5PBimrKDrJQ
D:\training_dataset\id10006\5tGaUGO_z50
D:\training_dataset\id10006\7qUfkhbDaqc
D:\training_dataset\id10006\7W9goO0lNrA
D:\training_dataset\id10006\88biLNZtnag


D:\training_dataset\id10026\dQhROKhai8Y
D:\training_dataset\id10026\DXN_rEFztlc
D:\training_dataset\id10026\JzrVApWpAdo
D:\training_dataset\id10026\kpkaaj2jx6k
D:\training_dataset\id10026\KzCToed2Rcc
D:\training_dataset\id10026\TuftZsOgdzs
D:\training_dataset\id10026\vkHXIe40wns
D:\training_dataset\id10026\Vs-FvXU_GBE
D:\training_dataset\id10026\WBjWoYv8WcI
D:\training_dataset\id10026\yczc53tYhS8
D:\training_dataset\id10027\50IAfJCypFI
D:\training_dataset\id10027\69_QrKoy-GA
D:\training_dataset\id10027\6cXKzqLQYfQ
D:\training_dataset\id10027\ccbXWh4og34
D:\training_dataset\id10027\E9_VY1Y5uL0
D:\training_dataset\id10027\FATQ1UkoIH4
D:\training_dataset\id10027\g2m1E3C9wY4
D:\training_dataset\id10027\mnJX0ydvfBw
D:\training_dataset\id10027\N4vE1gwQQQA
D:\training_dataset\id10027\orwnPXD5JIU
D:\training_dataset\id10027\P-7wcXaLUUs
D:\training_dataset\id10027\qWdjhdcaHyk
D:\training_dataset\id10027\QZuA-h2TORw
D:\training_dataset\id10027\UFrqp10LwLs
D:\training_dataset\id10027\USjCbjIpTuI


D:\training_dataset\id10041\3t-RnqGXDrM
D:\training_dataset\id10041\5g4NwozVJpA
D:\training_dataset\id10041\6TmD9opLBY8
D:\training_dataset\id10041\9fJl5wUSbMM
D:\training_dataset\id10041\9THGEtI-rv8
D:\training_dataset\id10041\airbUybc41Y
D:\training_dataset\id10041\Ap16js05dA8
D:\training_dataset\id10041\bkT-M07CrjA
D:\training_dataset\id10041\D3dHTB6l9r0
D:\training_dataset\id10041\eUXTdQY-XYE
D:\training_dataset\id10041\g_v7-sRZGWs
D:\training_dataset\id10041\HBF8LIr8RG4
D:\training_dataset\id10041\hJjhdy2R-xg
D:\training_dataset\id10041\jar0wj_nfJk
D:\training_dataset\id10041\LNlufCgIx_E
D:\training_dataset\id10041\LrXEcxBXPWA
D:\training_dataset\id10041\LvoztHyRXy8
D:\training_dataset\id10041\mcpkLfUBzeU
D:\training_dataset\id10041\n9rD9-kU2Mw
D:\training_dataset\id10041\nUH8iI0t4rE
D:\training_dataset\id10041\oKMQz0cxdQw
D:\training_dataset\id10041\Pg20jX3taeA
D:\training_dataset\id10041\PZni1fi5MTQ
D:\training_dataset\id10041\qsX7s9M3lbk
D:\training_dataset\id10041\rzxZntDzey0


D:\training_dataset\id10060\LQV_TmdoHQY
D:\training_dataset\id10060\nXEHcYcTl2g
D:\training_dataset\id10060\qONhYpo2OKg
D:\training_dataset\id10060\QyedehdxmAU
D:\training_dataset\id10060\u7KTRzyv_yE
D:\training_dataset\id10060\UIPg3goNdzE
D:\training_dataset\id10060\wbhGl_7xUUM
D:\training_dataset\id10060\XItoro6xkB8
D:\training_dataset\id10061\0Wgg3wCb5Ws
D:\training_dataset\id10061\6gAqaYX0Lig
D:\training_dataset\id10061\7nEHZZi6_nU
D:\training_dataset\id10061\8GGP83uAxI4
D:\training_dataset\id10061\8wq-50fDL_Y
D:\training_dataset\id10061\9aqJt_vOFEI
D:\training_dataset\id10061\9ULTOXn2tKY
D:\training_dataset\id10061\AcieZuBaXmA
D:\training_dataset\id10061\aQxvMdIyImI
D:\training_dataset\id10061\C0OsreKPaXk
D:\training_dataset\id10061\ChVXy6IHQsA
D:\training_dataset\id10061\FiS_89JghYQ
D:\training_dataset\id10061\Fn6rF6roHPk
D:\training_dataset\id10061\hcH7Vm3owZM
D:\training_dataset\id10061\it9OkIxQgZw
D:\training_dataset\id10061\Ixczeh9v-Y0
D:\training_dataset\id10061\J9cHmdyBX04


D:\training_dataset\id10092\qBLkypwoFiE
D:\training_dataset\id10092\RvBMUxYJxpI
D:\training_dataset\id10092\S8mkKrMGkmU
D:\training_dataset\id10092\vdncTmuqlpI
D:\training_dataset\id10092\vi8dbWAimF0
D:\training_dataset\id10092\Zd2TVtbInjk
D:\training_dataset\id10092\ztBbZUau1Qc
D:\training_dataset\id10098\0lGRbE-g7BU
D:\training_dataset\id10098\1BeZJUe-K3A
D:\training_dataset\id10098\5wXAGY1COow
D:\training_dataset\id10098\8f2ReesQMrs
D:\training_dataset\id10098\a0yFzrtncuk
D:\training_dataset\id10098\aE1yoBKJP-A
D:\training_dataset\id10098\anQQVJzTg-4
D:\training_dataset\id10098\ecQEh3QY3dQ
D:\training_dataset\id10098\EgZwDUHr-aQ
D:\training_dataset\id10098\g8oGChGhCtI
D:\training_dataset\id10098\jJLQc0W1fds
D:\training_dataset\id10098\L2W5wQELl_s
D:\training_dataset\id10098\oH6pW0Hc4lA
D:\training_dataset\id10098\TKKUJDaDMB8
D:\training_dataset\id10098\vCp7m5eFDAo
D:\training_dataset\id10098\VsS1-jhNn2A
D:\training_dataset\id10098\x5dzO9YpvRA
D:\training_dataset\id10098\ypN3C-nPMDA


D:\training_dataset\id10124\u4YFdRMs9fY
D:\training_dataset\id10124\VUQmcsrbtkg
D:\training_dataset\id10124\z0C5sd3qcXk
D:\training_dataset\id10124\ZuN8jy-VppY
D:\training_dataset\id10125\08wOLi1Nwsc
D:\training_dataset\id10125\0DK4jdEWXhA
D:\training_dataset\id10125\2fNrBNaY1m8
D:\training_dataset\id10125\4caC5o4XOtc
D:\training_dataset\id10125\5cCOrJUBaug
D:\training_dataset\id10125\90YNetVBMAQ
D:\training_dataset\id10125\AXyiWjtvhk0
D:\training_dataset\id10125\eL9vit9ZOmE
D:\training_dataset\id10125\HCwSRyoNx9Q
D:\training_dataset\id10125\imqsfPNhFBU
D:\training_dataset\id10125\iSGLirjPyfo
D:\training_dataset\id10125\iyHOYmDAYio
D:\training_dataset\id10125\K0ZFbs3I5Ro
D:\training_dataset\id10125\mtx9LVul6Fc
D:\training_dataset\id10125\QPHNXybdpdM
D:\training_dataset\id10125\rX1H5GLTRSI
D:\training_dataset\id10125\um19Q6kGSo0
D:\training_dataset\id10125\XRrAUnwmvII
D:\training_dataset\id10125\Y4a_LeHg1YQ
D:\training_dataset\id10126\1nHa056iG1E
D:\training_dataset\id10126\aHBeInJvdkk


D:\training_dataset\id10138\xIi3oPAC0ac
D:\training_dataset\id10138\YAnsbEv085w
D:\training_dataset\id10138\ZuGfZl-tca0
D:\training_dataset\id10141\2djfpVN_86k
D:\training_dataset\id10141\4cibjBno0FU
D:\training_dataset\id10141\6MR4tIkDTpg
D:\training_dataset\id10141\8s7zAquPd04
D:\training_dataset\id10141\9KhoZYmdIAI
D:\training_dataset\id10141\d7PeIXZqni4
D:\training_dataset\id10141\dZGwAFXVy5g
D:\training_dataset\id10141\EVng88l7lS8
D:\training_dataset\id10141\fCJOb-egwE8
D:\training_dataset\id10141\mwlugkOuFl0
D:\training_dataset\id10141\o9ErDgzUZJE
D:\training_dataset\id10141\q4A_IEyuPWM
D:\training_dataset\id10141\sgYGenlbsfI
D:\training_dataset\id10141\UNgiwug0gQ0
D:\training_dataset\id10141\UOxg8l4ykPQ
D:\training_dataset\id10141\WP1N65JPioc
D:\training_dataset\id10141\XqI9dns0kAs
D:\training_dataset\id10141\zHtmdnWUjcY
D:\training_dataset\id10141\_FwUjCIPjdc
D:\training_dataset\id10145\6hlIufEg-sY
D:\training_dataset\id10145\CH5lEZC3Miw
D:\training_dataset\id10145\e--gQ-YDVFk


D:\training_dataset\id10167\BXHj7xlYSs4
D:\training_dataset\id10167\cnqmxToUg5E
D:\training_dataset\id10167\D9nqnhtAlDI
D:\training_dataset\id10167\htRgyKETQ1Q
D:\training_dataset\id10167\hZNeJxx8_Do
D:\training_dataset\id10167\ISwFELK9aUo
D:\training_dataset\id10167\Jx8GYQSmVJM
D:\training_dataset\id10167\k4MsXDAahO4
D:\training_dataset\id10167\LF4dVas6FDc
D:\training_dataset\id10167\lJj9cYH7AZg
D:\training_dataset\id10167\Mv8yqWjcl3s
D:\training_dataset\id10167\OKluVFV8URU
D:\training_dataset\id10167\plDhnFBVzpk
D:\training_dataset\id10167\QGZMOy6srzI
D:\training_dataset\id10167\r4NcTmxH8DM
D:\training_dataset\id10167\RMmg7vVURmE
D:\training_dataset\id10167\RoCsorfgp4M
D:\training_dataset\id10167\rwhZnmjY6JY
D:\training_dataset\id10167\TQpK-Q7YPD8
D:\training_dataset\id10167\tT40nn85A1w
D:\training_dataset\id10167\u9NlX43okUs
D:\training_dataset\id10167\wG6Uyg_HQ1o
D:\training_dataset\id10167\wHNzaAzb9Yo
D:\training_dataset\id10167\WkSS6QonBZU
D:\training_dataset\id10167\WtQ2beJwYLY


D:\training_dataset\id10186\Sv0dKM3HZxc
D:\training_dataset\id10186\sx05f5evyqo
D:\training_dataset\id10186\V3AZe5siMf4
D:\training_dataset\id10186\vr36lRI4Jh4
D:\training_dataset\id10186\V_tzoDBiP8A
D:\training_dataset\id10188\0OniOYEgQOE
D:\training_dataset\id10188\3J3MdgDuw6I
D:\training_dataset\id10188\4N4lCaudYhs
D:\training_dataset\id10188\A24Ht4QXalQ
D:\training_dataset\id10188\aI3eCrBBIb8
D:\training_dataset\id10188\AKnyennLMzA
D:\training_dataset\id10188\D-bv3YvgXf0
D:\training_dataset\id10188\DJk_5DLXnxQ
D:\training_dataset\id10188\DT0fB8E7Rro
D:\training_dataset\id10188\F6HFk7JmW04
D:\training_dataset\id10188\h8GSm3D_hiw
D:\training_dataset\id10188\Jxuk7OKCqc8
D:\training_dataset\id10188\L-ltfAiM1PQ
D:\training_dataset\id10188\lJvl8NRBKic
D:\training_dataset\id10188\lkx4CDgjSC8
D:\training_dataset\id10188\P-rFZlcyTlI
D:\training_dataset\id10188\rQNaoSsQLzg
D:\training_dataset\id10188\TkVbedVcwos
D:\training_dataset\id10188\v8M4yVx0m_0
D:\training_dataset\id10188\xftevGLBr40


D:\training_dataset\id10200\vgEmA6im1wk
D:\training_dataset\id10200\w6SIoqrsVmc
D:\training_dataset\id10200\XJOIQhBDjvo
D:\training_dataset\id10200\zXyIgnlylbQ
D:\training_dataset\id10201\4jXzPU5UWGU
D:\training_dataset\id10201\5AkXy9lLA6w
D:\training_dataset\id10201\DJe_WaGBGIo
D:\training_dataset\id10201\dYghTM9be7g
D:\training_dataset\id10201\G5fF_3MbWSI
D:\training_dataset\id10201\jtRzXxrBzRU
D:\training_dataset\id10201\kd-mJw-BoLw
D:\training_dataset\id10201\Nj5nerPQPCg
D:\training_dataset\id10201\nW1ZwX6ImFI
D:\training_dataset\id10201\Puocje_yEmg
D:\training_dataset\id10201\Rr3m3ERDMzg
D:\training_dataset\id10201\suzGWjAoVDM
D:\training_dataset\id10201\UqPXVAOf1gA
D:\training_dataset\id10201\XisAXLqM70g
D:\training_dataset\id10201\Z0PgNWSwi9s
D:\training_dataset\id10201\_40uCvxxbv4
D:\training_dataset\id10201\_VqmjwPtTjQ
D:\training_dataset\id10202\22yFoicYk2w
D:\training_dataset\id10202\3zRcmh204No
D:\training_dataset\id10202\4EtvHjTP-mA
D:\training_dataset\id10202\6JjuwVmTXs8


D:\training_dataset\id10227\JZZ3zN2NYOo
D:\training_dataset\id10227\KFid1BdukbY
D:\training_dataset\id10227\kmLUiUGMXzE
D:\training_dataset\id10227\kP8b8vhasi8
D:\training_dataset\id10227\LzGnlM1ktjQ
D:\training_dataset\id10227\n3N7d1RZjAs
D:\training_dataset\id10227\ODaOjyB0yxM
D:\training_dataset\id10227\PggxXLX0DSo
D:\training_dataset\id10227\RGAqZZJ2lIk
D:\training_dataset\id10227\Tb4eHMeEISM
D:\training_dataset\id10227\tf1aaIdx3dQ
D:\training_dataset\id10227\uJpCVCxZz9M
D:\training_dataset\id10227\Vsp-G00eH90
D:\training_dataset\id10227\WSlRurPaRq0
D:\training_dataset\id10227\YEDm4QCiePQ
D:\training_dataset\id10228\5kdw_bOW-nc
D:\training_dataset\id10228\5MiImkK2KGg
D:\training_dataset\id10228\cTkPkW2qcTQ
D:\training_dataset\id10228\CULPm5jJFdU
D:\training_dataset\id10228\DE6fBJfyqAQ
D:\training_dataset\id10228\e-24UEZU83Y
D:\training_dataset\id10228\FkPqCMyFZsM
D:\training_dataset\id10228\GWPlmKDW8L4
D:\training_dataset\id10228\i4GZflduqsY
D:\training_dataset\id10228\idBrNQwCRVg


D:\training_dataset\id10240\tWio9QgBKtY
D:\training_dataset\id10240\Ux9d5FnflUw
D:\training_dataset\id10240\yvVsrwgFk6U
D:\training_dataset\id10240\Z-BcJDa90EY
D:\training_dataset\id10241\0GmspbGMzfo
D:\training_dataset\id10241\3RdlFwdPjJ0
D:\training_dataset\id10241\5e994PbAWH8
D:\training_dataset\id10241\BCvkayRlsbM
D:\training_dataset\id10241\C5iZn8D-qzc
D:\training_dataset\id10241\CEod4dSkMIY
D:\training_dataset\id10241\CIshUJUvdbo
D:\training_dataset\id10241\DuLU2ODsOrQ
D:\training_dataset\id10241\FQDhy0gM7A8
D:\training_dataset\id10241\fwQLAQtZo5s
D:\training_dataset\id10241\giy_4x1_ZFQ
D:\training_dataset\id10241\GsRgxCuOxQs
D:\training_dataset\id10241\HLA0wJTDiaE
D:\training_dataset\id10241\hzCA5qbVxUk
D:\training_dataset\id10241\iMQNAdDAXpU
D:\training_dataset\id10241\jn06nFHC8qU
D:\training_dataset\id10241\k8lyIWukUC8
D:\training_dataset\id10241\lto5uHow8h8
D:\training_dataset\id10241\mfQ6eCKgDpo
D:\training_dataset\id10241\neIYr0ntxzQ
D:\training_dataset\id10241\NXpPpqQ-BmE


D:\training_dataset\id10252\eU56nK2YsGs
D:\training_dataset\id10252\FQ5arqvAXgU
D:\training_dataset\id10252\h3I_YuvvFnU
D:\training_dataset\id10252\hinOZS4DgyA
D:\training_dataset\id10252\I3TGi0YB4CU
D:\training_dataset\id10252\L4AUu_lRWYc
D:\training_dataset\id10252\o8I9tSWUoZM
D:\training_dataset\id10252\pgle0bchMG8
D:\training_dataset\id10252\PrN-0sF8OVo
D:\training_dataset\id10252\qzub2ZUWVeM
D:\training_dataset\id10252\rdJzSAMT8J4
D:\training_dataset\id10252\RHBcRc1Ra7E
D:\training_dataset\id10252\ti9CQnn5aZw
D:\training_dataset\id10252\tn7Roknj3EI
D:\training_dataset\id10252\vlSgvSve8lw
D:\training_dataset\id10252\WcldHVxK8WA
D:\training_dataset\id10252\wvjrGCX6zfg
D:\training_dataset\id10252\XTafvCcCLT0
D:\training_dataset\id10253\4kC7nIrvA3M
D:\training_dataset\id10253\8iLvO9G5GD8
D:\training_dataset\id10253\bqEyl0msaB0
D:\training_dataset\id10253\FhaxOw1_JB8
D:\training_dataset\id10253\P9N2VSjq77c
D:\training_dataset\id10253\pkCM7NUIZ58
D:\training_dataset\id10253\QDLTmHwekpI


D:\training_dataset\id10268\JkfveeYh1-8
D:\training_dataset\id10268\KDnGwU4mCeI
D:\training_dataset\id10268\khYVtFO6_9U
D:\training_dataset\id10268\rAHdNd6rQ0I
D:\training_dataset\id10268\RmRPXGQyNaM
D:\training_dataset\id10268\VwDn1yIBOVo
D:\training_dataset\id10268\wed75lDPS9s
D:\training_dataset\id10268\yHE4-gOiHRo
D:\training_dataset\id10268\_k-FScI0YUE
D:\training_dataset\id10268\_Q4QxnUNqbo
D:\training_dataset\id10269\0vc2JwLCGyg
D:\training_dataset\id10269\26IuUx3w-LI
D:\training_dataset\id10269\4wEwTcXkY-4
D:\training_dataset\id10269\cGufm-UfkpA
D:\training_dataset\id10269\cyIgvrAXNmU
D:\training_dataset\id10269\eIV480Vz-pc
D:\training_dataset\id10269\fBbAmURijSs
D:\training_dataset\id10269\h0L5sEc4Vkk
D:\training_dataset\id10269\HJZFGU3cU_A
D:\training_dataset\id10269\jbPZeDnMSQo
D:\training_dataset\id10269\nJcvkkjS6gs
D:\training_dataset\id10269\OacO5cTTIQw
D:\training_dataset\id10269\ojsBqIWFXDA
D:\training_dataset\id10269\oZtessx96JI
D:\training_dataset\id10269\PbT967mHtDE


D:\training_dataset\id10286\a3QzK5pWpI4
D:\training_dataset\id10286\cJ40ZqTY3dQ
D:\training_dataset\id10286\DPPq8w5ILrA
D:\training_dataset\id10286\FP4TghS5_UQ
D:\training_dataset\id10286\gkDeAS985L4
D:\training_dataset\id10286\iH0jGPqp3Hc
D:\training_dataset\id10286\isKyMAYUOgg
D:\training_dataset\id10286\J37FfHHefdc
D:\training_dataset\id10286\L2jRsPDZGgw
D:\training_dataset\id10286\lBkbWztMEUY
D:\training_dataset\id10286\lJQTN_wK9vA
D:\training_dataset\id10286\mmFmN0OPhKs
D:\training_dataset\id10286\mYc2BIS0zWU
D:\training_dataset\id10286\PlJd6NVoigs
D:\training_dataset\id10286\pqJw3UuOuCY
D:\training_dataset\id10286\Qkll9n7U5ak
D:\training_dataset\id10286\rPKjog9010U
D:\training_dataset\id10286\yAe_ZhyWdnI
D:\training_dataset\id10286\ZAK9gLYNqPs
D:\training_dataset\id10287\2iaCNe7ajII
D:\training_dataset\id10287\4oOmqI1myzY
D:\training_dataset\id10287\4SgMBa9MUvc
D:\training_dataset\id10287\8vC0LQMqy8Y
D:\training_dataset\id10287\bP0bKbQQlzc
D:\training_dataset\id10287\I99R40TUF1s


D:\training_dataset\id10297\o1EKUvjvNHY
D:\training_dataset\id10297\p08spQszEYU
D:\training_dataset\id10297\utAY0zpsv1U
D:\training_dataset\id10297\uxHwuJ59xNk
D:\training_dataset\id10298\3rge7xFekcQ
D:\training_dataset\id10298\6qFnVechX9o
D:\training_dataset\id10298\8v0-r8HKI_0
D:\training_dataset\id10298\9o3HnyKpHLM
D:\training_dataset\id10298\a0S-epXYJME
D:\training_dataset\id10298\aEuSKFj0xo8
D:\training_dataset\id10298\Cjt5DP6qc5Q
D:\training_dataset\id10298\DWT9P35cXT4
D:\training_dataset\id10298\f5eaTNf7-io
D:\training_dataset\id10298\gNR5NHzd3o0
D:\training_dataset\id10298\H5X-KaeTzEM
D:\training_dataset\id10298\hjvQiiG71rM
D:\training_dataset\id10298\hSC1Yb953Sg
D:\training_dataset\id10298\KaNukQi4imU
D:\training_dataset\id10298\KnOFOXpwmp0
D:\training_dataset\id10298\qrJ0fQJP5gs
D:\training_dataset\id10298\SCYPNx1H2pM
D:\training_dataset\id10298\U_1vfIVOigA
D:\training_dataset\id10298\VHtcwtdPjpE
D:\training_dataset\id10299\752GDDvGPCA
D:\training_dataset\id10299\DdZRjmvZKbI


D:\training_dataset\id10308\sy19cFuDvX8
D:\training_dataset\id10308\Tzs_CTbHT9Y
D:\training_dataset\id10308\vbxURBaCGr0
D:\training_dataset\id10308\We7oIBrKQKg
D:\training_dataset\id10308\XQWpVt5n3Ic
D:\training_dataset\id10308\ygGJs_Rh7ds
D:\training_dataset\id10309\0b1inHMAr6o
D:\training_dataset\id10309\0cYFdtyWVds
D:\training_dataset\id10309\0wLebLY6tcM
D:\training_dataset\id10309\3qX7bbJ17lY
D:\training_dataset\id10309\e-IdJ8a4gy4
D:\training_dataset\id10309\GyJHpVQmcvc
D:\training_dataset\id10309\MJ3skMKNBGM
D:\training_dataset\id10309\nMZkxUQ1RqI
D:\training_dataset\id10309\pwfqGqgezH4
D:\training_dataset\id10309\qFrRfhWombs
D:\training_dataset\id10309\RkN41E3gRpc
D:\training_dataset\id10309\rqaAm4bEsXc
D:\training_dataset\id10309\RWpuDFuKFjY
D:\training_dataset\id10309\rxnN8thYzEQ
D:\training_dataset\id10309\tGEWD2GaiDw
D:\training_dataset\id10309\uTOEzdTJNmU
D:\training_dataset\id10309\vobW27_-JyQ
D:\training_dataset\id10309\VxiCsd0gA6o
D:\training_dataset\id10309\V_tzoDBiP8A


D:\training_dataset\id10321\FWl8i5z7Omc
D:\training_dataset\id10321\GtUep_eTQ7U
D:\training_dataset\id10321\hBjeJUlFwsk
D:\training_dataset\id10321\lHfkWwCIT1E
D:\training_dataset\id10321\ljoqep7Io4Q
D:\training_dataset\id10321\lVrq3CHePwo
D:\training_dataset\id10321\n8VLoG1dTWM
D:\training_dataset\id10321\o4sTVkDiA1I
D:\training_dataset\id10321\oinBZMkxgts
D:\training_dataset\id10321\qAkrKOCOAdc
D:\training_dataset\id10321\sM6t2znWSo0
D:\training_dataset\id10321\SpxA79ojPY4
D:\training_dataset\id10321\VCCUJbbbJ4A
D:\training_dataset\id10321\x6bYDSh6EQI
D:\training_dataset\id10321\_mUc_Xl3gdM
D:\training_dataset\id10323\137siDbXp0I
D:\training_dataset\id10323\6zlsqpaYJdI
D:\training_dataset\id10323\kmnrEbUqNpI
D:\training_dataset\id10323\nrShtAbues4
D:\training_dataset\id10323\oTbfU_4YKdM
D:\training_dataset\id10323\QNGljKaw7pI
D:\training_dataset\id10323\VG9nNnr_a2o
D:\training_dataset\id10323\yyquFD8EKbA
D:\training_dataset\id10323\_bWQM4g06NY
D:\training_dataset\id10324\1BN1Twr0pDM


D:\training_dataset\id10334\alA_YRxKZCg
D:\training_dataset\id10334\C_JlIlY7Pbg
D:\training_dataset\id10334\gdS156IsqgE
D:\training_dataset\id10334\S1EWztcS8OE
D:\training_dataset\id10334\SW59jYm9Inw
D:\training_dataset\id10334\sWyVJdigpjc
D:\training_dataset\id10334\VpmzPfEfmb8
D:\training_dataset\id10334\VQBnetylQTI
D:\training_dataset\id10334\x7wfgqZQT5M
D:\training_dataset\id10334\ylLusLP--Ak
D:\training_dataset\id10334\YrI8Bv-ukgI
D:\training_dataset\id10334\YVXdKnu0s6k
D:\training_dataset\id10334\ZhhqTUZeIlU
D:\training_dataset\id10335\3j5n_FdTUg4
D:\training_dataset\id10335\5OLzRQHT290
D:\training_dataset\id10335\6lFw-tpQPLA
D:\training_dataset\id10335\8vwmMOAb5F8
D:\training_dataset\id10335\AY9apGnb0eM
D:\training_dataset\id10335\dCdRPb2_lMg
D:\training_dataset\id10335\DofYixK7wUQ
D:\training_dataset\id10335\expqXRg1Uyk
D:\training_dataset\id10335\GmZQ6Y0FSPA
D:\training_dataset\id10335\I9Hq0ZJ65l0
D:\training_dataset\id10335\IeLbPRXujaU
D:\training_dataset\id10335\JGgisII8aP4


D:\training_dataset\id10348\htGoC9IiAPc
D:\training_dataset\id10348\J5FZ35Ja3hQ
D:\training_dataset\id10348\kDku13RPflI
D:\training_dataset\id10348\KIy0AGQSAJc
D:\training_dataset\id10348\L0y-9ignSgs
D:\training_dataset\id10348\MNKD9ZPx8OY
D:\training_dataset\id10348\UIkqT7O1Ij8
D:\training_dataset\id10348\ydO2ZO-6-yU
D:\training_dataset\id10349\8VlMBWsbL0o
D:\training_dataset\id10349\CkdpPS1E5Ks
D:\training_dataset\id10349\ebmuSH4EpLc
D:\training_dataset\id10349\gO_tKqtX4sc
D:\training_dataset\id10349\h2XXZOzcqzM
D:\training_dataset\id10349\jQglnsLZ3R0
D:\training_dataset\id10349\QGlexcvxGFo
D:\training_dataset\id10349\vwHG7YVqIpE
D:\training_dataset\id10350\5pa13peHVtk
D:\training_dataset\id10350\6o-LScd5k38
D:\training_dataset\id10350\BppiewHcORc
D:\training_dataset\id10350\Ok_1huCJP0o
D:\training_dataset\id10350\pYi_jIz_VSw
D:\training_dataset\id10350\ryGNVsMrgBs
D:\training_dataset\id10350\SHWdWC0GJYo
D:\training_dataset\id10350\sVAA-cQ6JaA
D:\training_dataset\id10350\sxmVLwz2dGU


In [None]:
#The Dataset: preview the first rows of the extracted feature table
data.head()

In [None]:
#Now we have to label each row with the gender corresponding to the 'ID'
#Load the label table saved earlier and convert it to a numpy array so that
#its rows can be looked up one by one
csv_label=pd.read_csv("C:\\Users\\SAGAR\\Desktop\\Final Year Project\\csv_dataset\\voice_dataset.csv")
csv_file=csv_label.to_numpy()
csv_file

First, we read the CSV file that contains the 'ID' and the gender label (along with the country) of each speaker.
Then the 'ID' values of the CSV file and of the dataset are compared.
If they are equal, rows with the gender label 'm' are added to one dataframe and rows with the label 'f' are added to another dataframe.
Then both dataframes are concatenated and the entire dataset is saved as a CSV file.

In [None]:
# Attach the gender label to every feature row whose 'ID' matches an ID of
# the label table. The result lists all male rows first and all female rows
# second, each group in label-table order.
array_label = data.to_numpy()
feature_columns = ['ID', 'meanfreq', 'sd', 'median', 'Q25', 'Q75', 'iqr',
                   'skew', 'kurt', 'mode', 'low', 'peak', 'centroid',
                   'flatness', 'spectral rolloff', 'spectral bandwidth']
labelled_columns = feature_columns + ["LABEL"]

# Group the feature rows by ID once, so that every label row needs a single
# dictionary lookup instead of a scan over the whole feature table.
rows_by_id = {}
for feature_row in array_label:
    rows_by_id.setdefault(feature_row[0], []).append(feature_row)

male_rows = []
female_rows = []
for label_row in csv_file:
    speaker_id = label_row[0]
    label = label_row[1]
    if label == 'm':
        target = male_rows
    elif label == 'f':
        target = female_rows
    else:
        # Rows with any other label are ignored.
        continue
    for feature_row in rows_by_id.get(speaker_id, []):
        # The first 16 values are the ID followed by the 15 features.
        target.append(list(feature_row[:16]) + [label])

csv_label_1 = pd.DataFrame(male_rows, columns=labelled_columns)
csv_label_0 = pd.DataFrame(female_rows, columns=labelled_columns)
# ignore_index renumbers the combined rows 0..n-1.
df = pd.concat([csv_label_1, csv_label_0], ignore_index=True)
df.head()

In [None]:
#Save the labelled dataset as dataset.csv; index=False keeps the row index out of the file
df.to_csv("C:\\Users\\SAGAR\\Desktop\\Final Year Project\\csv_dataset\\dataset.csv",index=False)

# Thank you!!!!