In [1]:
import pydub
import pandas as pd
import librosa
import numpy as np
import os

In [2]:
# Source and destination directories for the mp3 -> wav conversion.
mp3_folder, wav_folder = (
    "Datasets/clips/",      # location of the mp3 audio files
    "Datasets/wav_clips/",  # location where the wav files will be saved
)
# Append the FFMPEG bin directory to the process PATH.
os.environ["PATH"] = os.pathsep.join((os.environ["PATH"], "FFMPEG/bin"))

In [None]:
# Convert up to `i_limit` mp3 clips from `mp3_folder` into wav files in `wav_folder`.
i_limit = 1000  # maximum number of clips to convert in one run
i = 0           # number of clips converted so far

# The export below cannot create missing directories, so make sure the target exists.
os.makedirs(wav_folder, exist_ok=True)

mp3_dir = os.fsdecode(mp3_folder)
# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make the subset of converted clips differ from run to run.
for path in sorted(os.listdir(mp3_dir)):
    if i == i_limit:
        break
    stem, ext = os.path.splitext(path)
    if ext.lower() != ".mp3":
        # Skip anything that is not an mp3 (hidden files, sub-folders, ...)
        # instead of handing it to the decoder.
        continue
    src = os.path.abspath(os.path.join(mp3_dir, path))
    dst = os.path.join(wav_folder, stem + ".wav")
    sound = pydub.AudioSegment.from_mp3(src)
    # Pass the output format by keyword; the original called the builtin
    # format("wav"), which only worked because it returns the string "wav".
    sound.export(dst, format="wav")
    i += 1

# Feature Extraction

In [3]:
# Tab-separated metadata file to read, and the CSV file that the extracted
# speech features are written to.
pathname, output_pathname = (
    "Datasets/validated.tsv",
    "Datasets/validated_SpeechFeatures.csv",
)

In [4]:
# Load the tab-separated metadata file at `pathname` into a dataframe.
df = pd.read_csv(pathname, sep="\t")
df  # display the loaded frame

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,0013037a1d45cc33460806cc3f8ecee9d536c45639ba4c...,common_voice_en_699711.mp3,She'll be all right.,2,1,,,
1,003fb666a99eb3aa3ba05d9c8641c18e55cf7d34d1b981...,common_voice_en_17263741.mp3,Do you mean it?,2,0,,,
2,0047f1aea3f39c4c6a9298d84f046c1f84f439f594d840...,common_voice_en_17561821.mp3,How is Mozilla going to handle ambiguities lik...,2,0,,,
3,00610bda27826b8615139e6a430d43382935f37922dc1c...,common_voice_en_59751.mp3,I guess you must think I'm kinda batty.,2,1,,,
4,007762ba618b38ed77739616c7a6ace0341e1e134ac405...,common_voice_en_18902772.mp3,Groves started writing songs when she was four...,2,1,,,
...,...,...,...,...,...,...,...,...
644114,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,common_voice_en_18792608.mp3,It is almost exclusively in the final version ...,2,1,twenties,male,
644115,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,common_voice_en_18792609.mp3,All songs written by Mark Greaney.,2,0,twenties,male,
644116,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,common_voice_en_18792611.mp3,Later the ninth and tenth grades were added.,2,1,twenties,male,
644117,939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810...,common_voice_en_18792612.mp3,"Aull, who named it Quasi-Rolle theorem.",2,1,twenties,male,


In [5]:
# Keep only rows that have age, gender AND accent filled in. This will
# definitely lead to a smaller sample size and less reliable results, but for
# simplicity's and comparison's sake only fully labelled data is used.
labelled = df.dropna(subset=["age", "gender", "accent"])

# Rows with gender == "other" are deliberately NOT removed right away; whether
# models are affected by them can be tested in a separate experiment.
# labelled = labelled[labelled.gender != "other"]

# up_votes and down_votes are kept as well, alongside the clip path and labels.
kept_columns = ["up_votes", "down_votes", "path", "age", "gender", "accent"]
df = labelled[kept_columns]

df

Unnamed: 0,up_votes,down_votes,path,age,gender,accent
8,2,0,common_voice_en_18489793.mp3,thirties,male,us
11,2,0,common_voice_en_17779714.mp3,thirties,male,canada
40,2,1,common_voice_en_125399.mp3,seventies,female,england
44,2,1,common_voice_en_18274221.mp3,twenties,male,us
54,2,0,common_voice_en_17147389.mp3,twenties,male,us
...,...,...,...,...,...,...
638031,2,0,common_voice_en_18829416.mp3,sixties,male,us
638032,2,0,common_voice_en_18829417.mp3,sixties,male,us
638033,2,0,common_voice_en_18829418.mp3,sixties,male,us
638034,2,0,common_voice_en_18829419.mp3,sixties,male,us


In [6]:
# New columns: one accumulator list per speech feature.

# chromagram https://en.wikipedia.org/wiki/Chroma_feature
column_chroma_stft = list()
# root mean square https://en.wikipedia.org/wiki/Root_mean_square#In_common_waveforms
column_rms = list()
# spectral centroid https://en.wikipedia.org/wiki/Spectral_centroid
column_spec_cent = list()
# spectral bandwidth https://en.wikipedia.org/wiki/Bandwidth_(signal_processing)#x_dB_bandwidth
column_spec_bw = list()
# spectral roll-off https://en.wikipedia.org/wiki/Roll-off
column_rolloff = list()
# zero-crossing rate https://en.wikipedia.org/wiki/Zero-crossing_rate
column_zcr = list()
# Mel-frequency cepstrum coefficients https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
column_mfcc = list()

In [8]:
# Get all audio file paths and names: for every wav file in `wav_folder`, record
# its absolute path and the name of the mp3 file it corresponds to.
wav_FilePaths = []  # absolute paths of the wav files
mp3_FileNames = []  # matching mp3 file names (same stem, ".mp3" extension)
wav_dir = os.fsdecode(wav_folder)
for path in os.listdir(wav_dir):
    stem, ext = os.path.splitext(path)
    if ext.lower() != ".wav":
        # Ignore entries that are not wav files; slicing off the last three
        # characters would otherwise invent an mp3 name for them.
        continue
    wav_FilePaths.append(os.path.abspath(os.path.join(wav_dir, path)))
    mp3_FileNames.append(stem + ".mp3")

In [9]:
# Lookup table: mp3 file name -> path of the corresponding wav file.
dictionary = {mp3_name: wav_path for mp3_name, wav_path in zip(mp3_FileNames, wav_FilePaths)}

# Remove all entries from the dataframe where we don't have the given audio file.
has_audio = df["path"].isin(mp3_FileNames)
df = df[has_audio]

In [12]:
# Get speech features: for every clip left in the dataframe, load its wav file
# and reduce each librosa feature to a single number (the mean over the whole
# feature array). The values are collected in the column_* lists.

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every value (which would make the lists longer than the frame).
column_chroma_stft = []
column_rms = []
column_spec_cent = []
column_spec_bw = []
column_rolloff = []
column_zcr = []
column_mfcc = []

for clip_name in df["path"]:
    src = dictionary.get(clip_name)
    y, sr = librosa.load(src)

    # https://librosa.github.io/librosa/generated/librosa.feature.chroma_stft.html
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
    # https://librosa.github.io/librosa/generated/librosa.feature.rms.html
    rms = np.mean(librosa.feature.rms(y=y))
    # https://librosa.github.io/librosa/generated/librosa.feature.spectral_centroid.html
    spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    # https://librosa.github.io/librosa/generated/librosa.feature.spectral_bandwidth.html
    spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    # https://librosa.github.io/librosa/generated/librosa.feature.spectral_rolloff.html
    rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    # https://librosa.github.io/librosa/generated/librosa.feature.zero_crossing_rate.html
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    # https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html
    # NOTE(review): this averages over every element of the MFCC array, i.e.
    # across all coefficients at once - confirm that is the intended summary.
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr))

    column_chroma_stft.append(chroma_stft)
    column_rms.append(rms)
    column_spec_cent.append(spec_cent)
    column_spec_bw.append(spec_bw)
    column_rolloff.append(rolloff)
    column_zcr.append(zcr)
    column_mfcc.append(mfcc)

In [62]:
# Add the collected feature values to the dataframe, one new column per feature.
df = df.assign(
    chroma_stft=column_chroma_stft,
    rms=column_rms,
    spec_cent=column_spec_cent,
    spec_bw=column_spec_bw,
    rolloff=column_rolloff,
    zcr=column_zcr,
    mfcc=column_mfcc,
)

In [63]:
# Write the frame to `output_pathname` as a UTF-8 CSV file, without the row index.
df.to_csv(output_pathname, encoding="utf-8", index=False)

In [64]:
# Display the frame that was just exported.
df

Unnamed: 0,"up_votes,down_votes,path,age,gender,accent,chroma_stft,rms,spec_cent,spec_bw,rolloff,zcr,mfcc",chroma_stft,rms,spec_cent,spec_bw,rolloff,zcr,mfcc
0,"2,0,common_voice_en_10091129.mp3,fifties,male,...",0.472234,0.024861,2307.869775,2132.580959,4797.588375,0.136395,-24.520874
1,"2,0,common_voice_en_10091130.mp3,fifties,male,...",0.415472,0.049218,1716.522632,1573.969255,3361.978055,0.098426,-17.076149
2,"3,0,common_voice_en_100737.mp3,thirties,male,b...",0.418802,0.070170,2510.204412,2305.931293,4676.633433,0.145721,-14.471638
3,"3,1,common_voice_en_100738.mp3,thirties,male,b...",0.346082,0.117950,2173.955546,2212.128879,3976.626141,0.115372,-9.808111
4,"3,0,common_voice_en_100739.mp3,thirties,male,b...",0.381020,0.087444,2312.963349,2266.425099,4328.682123,0.134091,-13.030169
...,...,...,...,...,...,...,...,...
1187,"2,0,common_voice_en_10209062.mp3,fifties,femal...",0.439742,0.033243,2970.832932,2662.461360,6184.766602,0.167612,-22.828602
1188,"2,0,common_voice_en_10209063.mp3,fifties,femal...",0.399398,0.026217,2760.807520,2266.275394,5187.672335,0.177096,-21.018087
1189,"2,0,common_voice_en_10209065.mp3,fifties,femal...",0.388222,0.024172,2230.666062,2299.027168,4676.918096,0.113927,-20.731462
1190,"2,0,common_voice_en_10209066.mp3,fifties,femal...",0.439761,0.021787,2918.167471,2388.274420,5495.593766,0.187932,-19.918543


# Unsure whether we're handling some features correctly
## Specifically: what does it mean to reduce a feature to its mean?

In [47]:
# Reference example: 1000 draws from a standard normal, rescaled by 1/(3/2) and
# shifted by 0.5, to see what mean / standard deviation look like on a known
# distribution. A dedicated, seeded generator makes the printed numbers
# reproducible and leaves numpy's global random state untouched.
rng = np.random.RandomState(0)
s = (rng.normal(0, 1, 1000)/(3/2))+0.5
print("mean",np.mean(s))
print("standard deviation",np.std(s))

mean 0.5252996308323116
standard deviation 0.6551467598389811


In [18]:
# Take the first row of the frame and load the wav file registered for it.
row = df.iloc[0]
src = dictionary.get(row["path"])
y, sr = librosa.load(src)

In [33]:
# Compute the chroma feature for the loaded clip (y, sr) and show the raw
# array as a dataframe, before any averaging is applied.
chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
pd.DataFrame(chroma_stft)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,222,223,224,225,226,227,228,229,230,231
0,0.092808,0.969735,0.625971,0.448994,0.578346,0.349611,0.137022,0.104749,0.478707,0.727236,...,0.231078,0.043716,0.049291,0.062192,0.190192,0.219637,0.580808,1.0,1.0,1.0
1,0.299524,0.924995,0.694071,0.561201,0.589821,0.083369,0.045389,0.095905,0.424037,0.546857,...,0.161246,0.08292,0.095235,0.084936,0.140475,0.1569,0.352587,0.65944,0.647527,0.831388
2,0.442416,0.96085,0.804859,0.698368,0.571954,0.048179,0.021514,0.073076,0.348865,0.565681,...,0.291782,0.393215,0.411487,0.255199,0.235687,0.405756,0.416473,0.18218,0.161548,0.490786
3,0.212672,0.972903,0.919992,0.777452,0.407909,0.04974,0.019823,0.060195,0.362948,0.836914,...,0.569673,0.678537,0.657563,0.441783,0.667875,0.49666,0.609432,0.286075,0.176542,0.475637
4,0.767374,0.925742,1.0,0.915062,0.353211,0.058762,0.029072,0.083646,0.390235,0.833481,...,0.645342,0.821314,0.967446,0.924943,0.699732,0.329572,0.425604,0.433617,0.187024,0.288696
5,1.0,0.892966,0.965685,1.0,0.655336,0.256988,0.241711,0.25649,0.454515,0.681706,...,0.5422,0.490669,0.713784,1.0,0.856976,0.414975,0.253835,0.170997,0.150512,0.328004
6,0.095433,0.810632,0.798375,0.900938,1.0,1.0,1.0,1.0,0.688468,0.73432,...,0.676281,0.367679,0.506041,0.710014,1.0,1.0,1.0,0.454668,0.187539,0.322597
7,0.002001,0.812836,0.712568,0.727313,0.719604,0.580178,0.528054,0.7255,0.534379,0.835946,...,0.834344,0.45023,0.461015,0.401598,0.491643,0.505544,0.948747,0.766988,0.25322,0.29954
8,0.002395,0.859011,0.698211,0.623854,0.415313,0.115997,0.084972,0.273825,0.765361,0.808704,...,1.0,1.0,1.0,0.601992,0.333028,0.211197,0.235076,0.231407,0.171895,0.199626
9,0.004477,0.946201,0.826623,0.825434,0.656463,0.090997,0.052723,0.157955,1.0,1.0,...,0.575498,0.687227,0.763034,0.526796,0.35544,0.167536,0.212863,0.209048,0.21897,0.359444


In [34]:
# Summarise the whole chroma array with its overall mean and standard deviation.
for label, reducer in (("mean", np.mean), ("standard deviation", np.std)):
    print(label, reducer(chroma_stft))

mean 0.4722341
standard deviation 0.29287666


In [23]:
# Compute the RMS feature for the loaded clip (only the signal y is passed)
# and display the raw array.
rms = librosa.feature.rms(y=y)
rms

array([[5.23070580e-11, 1.23472698e-03, 1.36642007e-03, 1.43705262e-03,
        1.58859720e-03, 1.09332416e-03, 9.61533282e-04, 8.82173947e-04,
        5.80175431e-04, 3.99295153e-04, 3.35834717e-04, 2.98594910e-04,
        3.07949726e-04, 2.93874997e-04, 2.62996648e-04, 2.10651488e-04,
        1.68391634e-04, 1.49758707e-04, 1.30224173e-04, 1.30901564e-04,
        1.17825606e-04, 1.06681153e-04, 1.06673026e-04, 9.42955376e-05,
        8.16094980e-05, 7.25632999e-05, 6.42960294e-05, 6.55515978e-05,
        6.54450705e-05, 6.68952052e-05, 6.86827771e-05, 7.36991569e-05,
        7.62500204e-05, 7.49695246e-05, 6.77734715e-05, 5.83688852e-05,
        5.11180078e-05, 4.58020841e-05, 4.53473513e-05, 4.19703138e-05,
        3.99896744e-05, 4.27330087e-05, 4.62597382e-05, 4.60725896e-05,
        5.73041034e-05, 6.59636353e-05, 7.59973846e-05, 7.77398382e-05,
        7.00828241e-05, 6.28848429e-05, 4.81806092e-05, 4.69450169e-05,
        4.61879572e-05, 5.00249953e-05, 7.30182364e-05, 7.830854

In [48]:
# Summarise the RMS array with its overall mean and standard deviation.
for label, reducer in (("mean", np.mean), ("standard deviation", np.std)):
    print(label, reducer(rms))

mean 0.02486109
standard deviation 0.049865287


In [24]:
# Compute the spectral centroid feature for the loaded clip (y, sr) and
# display the raw array.
spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
spec_cent

array([[9518.95805287, 1409.07262943, 1323.93803226, 1367.97873252,
        1526.05859538, 1388.93699144, 1326.32967444, 1547.48194008,
        1683.20379009, 1788.41747996, 1888.54476276, 1841.11658415,
        1747.43762027, 1760.55185573, 1800.80723056, 2006.80326737,
        2144.02213934, 2147.5554578 , 2062.23500711, 2180.21591445,
        2427.3048416 , 2394.41290129, 2479.37998303, 2506.43231215,
        2700.11563169, 2839.80674872, 2975.81745696, 2917.69338035,
        2814.07202903, 2913.09071452, 2886.54053321, 2826.12822615,
        2686.32327339, 2673.83110781, 2862.79551415, 3040.05645472,
        3179.18415456, 2970.61472021, 3072.96666455, 3188.25089488,
        3270.13658357, 3255.14109794, 3074.66795605, 2870.76011302,
        2937.64091428, 2824.90694803, 2647.30043911, 2628.3125724 ,
        2630.28696797, 3070.38702986, 3030.16995013, 3098.037113  ,
        3068.31086121, 3090.98402222, 2894.90485677, 2450.90921503,
        2547.42185985, 3124.71487679, 3042.52220

In [49]:
# Summarise the spectral centroid array with its overall mean and standard deviation.
for label, reducer in (("mean", np.mean), ("standard deviation", np.std)):
    print(label, reducer(spec_cent))

mean 2307.8697750009296
standard deviation 1054.622576363295


In [25]:
# Compute the spectral bandwidth feature for the loaded clip (y, sr) and
# display the raw array.
spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
spec_bw

array([[1386.89475924, 2152.34167747, 2043.98309208, 2015.61295456,
        1997.16399176, 1914.14995901, 1867.9695255 , 2029.72549271,
        2114.68305023, 2137.64538551, 2151.80346378, 2153.53421537,
        2153.84762191, 2183.62043749, 2192.94542125, 2283.04418177,
        2402.76231066, 2377.16527672, 2367.68318163, 2516.66606242,
        2655.47021257, 2679.91291243, 2718.9182721 , 2757.84611919,
        2777.93930142, 2737.14014167, 2726.14498715, 2688.27953496,
        2696.26120602, 2721.18421971, 2735.37451863, 2737.25788588,
        2746.34823105, 2744.04177438, 2712.02363265, 2809.59996383,
        2928.66077459, 2895.17530732, 2868.31056162, 2911.84828438,
        2937.23387642, 2955.93527639, 2900.71542295, 2799.839428  ,
        2820.57055692, 2878.56199506, 2897.18199132, 2840.42499387,
        2818.84591559, 2905.2309389 , 2721.5850062 , 2729.88496045,
        2806.89829915, 2864.94672672, 2877.35820523, 2807.60275985,
        2780.37263247, 2820.31530786, 2865.52336

In [50]:
# Summarise the spectral bandwidth array with its overall mean and standard deviation.
for label, reducer in (("mean", np.mean), ("standard deviation", np.std)):
    print(label, reducer(spec_bw))

mean 2132.5809592599444
standard deviation 863.2530521489168


In [26]:
# Compute the spectral roll-off feature for the loaded clip (y, sr) and
# display the raw array.
rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
rolloff

array([[10777.36816406,  3466.84570312,  3229.98046875,  3359.1796875 ,
         3800.61035156,  3283.81347656,  3046.94824219,  3886.74316406,
         4220.5078125 ,  4371.24023438,  4403.54003906,  4317.40722656,
         4338.94042969,  4360.47363281,  4360.47363281,  4823.4375    ,
         5006.46972656,  4855.73730469,  4855.73730469,  5329.46777344,
         5749.36523438,  5717.06542969,  5857.03125   ,  5964.69726562,
         6212.32910156,  6244.62890625,  6406.12792969,  6330.76171875,
         6212.32910156,  6416.89453125,  6363.06152344,  6223.09570312,
         6104.66308594,  6061.59667969,  6126.19628906,  6481.49414062,
         6836.79199219,  6599.92675781,  6653.75976562,  6815.25878906,
         6933.69140625,  6901.39160156,  6675.29296875,  6373.828125  ,
         6503.02734375,  6535.32714844,  6395.36132812,  6255.39550781,
         6147.72949219,  6772.19238281,  6449.19433594,  6438.42773438,
         6567.62695312,  6750.65917969,  6535.32714844,  6040.06

In [51]:
# Summarise the spectral roll-off array with its overall mean and standard deviation.
for label, reducer in (("mean", np.mean), ("standard deviation", np.std)):
    print(label, reducer(rolloff))

mean 4797.588374696928
standard deviation 2192.819491683409


In [27]:
# Compute the zero-crossing-rate feature for the loaded clip (only the signal y
# is passed) and display the raw array.
zcr = librosa.feature.zero_crossing_rate(y)
zcr

array([[0.00292969, 0.10253906, 0.12597656, 0.140625  , 0.1484375 ,
        0.07519531, 0.06884766, 0.07666016, 0.09179688, 0.08398438,
        0.08496094, 0.07519531, 0.06445312, 0.07617188, 0.08105469,
        0.12011719, 0.12109375, 0.11035156, 0.11523438, 0.08544922,
        0.08837891, 0.09716797, 0.09472656, 0.12255859, 0.14746094,
        0.171875  , 0.20117188, 0.19238281, 0.19726562, 0.19384766,
        0.17919922, 0.16552734, 0.15527344, 0.15722656, 0.17333984,
        0.18261719, 0.18945312, 0.20849609, 0.22021484, 0.25048828,
        0.25390625, 0.24169922, 0.21142578, 0.20849609, 0.1953125 ,
        0.15771484, 0.13183594, 0.10449219, 0.11132812, 0.13720703,
        0.17285156, 0.19384766, 0.20654297, 0.20654297, 0.18212891,
        0.15429688, 0.15478516, 0.13818359, 0.12060547, 0.09716797,
        0.046875  , 0.02050781, 0.02197266, 0.03417969, 0.04736328,
        0.06884766, 0.07958984, 0.07958984, 0.07763672, 0.06542969,
        0.06591797, 0.06103516, 0.05371094, 0.05

In [52]:
# Summarise the zero-crossing-rate array with its overall mean and standard deviation.
for label, reducer in (("mean", np.mean), ("standard deviation", np.std)):
    print(label, reducer(zcr))

mean 0.1363946322737069
standard deviation 0.06941451964822382


In [35]:
# Compute the MFCC feature for the loaded clip (y, sr) and show the raw array
# as a dataframe, before any averaging is applied.
mfcc = librosa.feature.mfcc(y=y, sr=sr)
pd.DataFrame(mfcc)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,222,223,224,225,226,227,228,229,230,231
0,-603.955078,-593.291077,-520.283997,-493.132751,-518.945435,-539.89386,-549.664001,-556.873535,-562.485107,-568.431763,...,-592.867493,-533.118347,-513.516296,-541.743591,-584.281372,-601.854004,-602.223389,-600.090393,-601.786255,-603.897644
1,0.0,14.779954,89.300186,101.966125,87.014832,72.95919,64.123055,55.553871,48.993706,42.619125,...,8.788815,-26.203707,-26.060612,-4.8419,4.684973,0.830393,1.887345,4.736764,2.813769,0.072105
2,0.0,13.923598,41.302555,35.473755,39.403557,41.448875,41.614273,36.518318,31.500027,26.170435,...,4.357154,-8.739331,-10.564333,1.815087,6.309908,0.786547,0.487786,2.815055,2.107842,0.046774
3,0.0,12.643251,17.734142,13.732781,7.801126,11.760794,17.610262,15.805918,13.094135,10.454675,...,1.462308,36.532143,42.009651,26.115713,3.856902,-0.463242,-1.053445,0.387015,1.099836,0.010932
4,0.0,11.124739,16.807842,10.641047,1.377396,-1.228355,4.559023,4.461003,3.53318,2.290278,...,-6.259363,-50.552269,-62.43,-44.475296,-15.125933,-2.297499,-2.009535,-1.712039,0.002259,-0.027366
5,0.0,9.568108,16.310585,16.375975,11.428601,-0.984918,1.519094,2.277765,2.328613,0.416915,...,-5.142308,0.075256,-5.554039,-7.361488,0.593995,-0.731247,-1.995154,-2.822021,-0.958395,-0.059515
6,0.0,8.146008,2.397642,4.689191,4.37899,-4.679338,-3.189314,-0.802338,0.440026,-0.683336,...,-2.007216,1.306145,-2.088471,-8.088541,-6.523582,-1.510866,-1.121629,-2.696471,-1.592593,-0.07829
7,0.0,6.970946,-6.216641,-1.62841,-0.403752,-7.131748,-5.78513,-1.181959,-0.155049,-2.49578,...,0.097673,10.828299,14.294389,8.125093,5.246885,0.665396,0.124064,-1.560954,-1.789901,-0.079471
8,0.0,6.078833,-1.452092,-0.7814,-5.214429,-13.678932,-12.853968,-6.753303,-4.265686,-5.232105,...,1.891505,10.717493,13.573292,18.966496,13.798908,1.645865,1.167819,0.021845,-1.542641,-0.062792
9,0.0,5.43184,7.150033,4.209261,-2.608333,-14.011948,-13.712798,-8.284541,-3.781065,-5.697572,...,2.498758,-0.2787,1.748924,4.719319,2.250097,1.356994,1.620045,1.409812,-0.945764,-0.032005


In [53]:
# Summarise the whole MFCC array with its overall mean and standard deviation.
for label, reducer in (("mean", np.mean), ("standard deviation", np.std)):
    print(label, reducer(mfcc))

mean -24.520874
standard deviation 117.96407


In [58]:
def present_values(row):
    """Print the mean and standard deviation of every speech feature for one clip.

    The wav file is looked up in the global ``dictionary`` via ``row.path`` and
    loaded with librosa. For each feature a heading line is printed, followed by
    a ``mu`` line and a ``std`` line computed over the whole feature array; a
    blank line closes the report.

    Parameters
    ----------
    row
        A dataframe row (or any object) exposing a ``path`` attribute that is a
        key of ``dictionary``.

    Returns
    -------
    None
    """
    src = dictionary.get(row.path)
    y, sr = librosa.load(src)

    # (heading, zero-argument extractor) pairs, in the order they are reported.
    extractors = (
        ("CHROMA_STFT", lambda: librosa.feature.chroma_stft(y=y, sr=sr)),
        ("RMS", lambda: librosa.feature.rms(y=y)),
        ("SPEC_CENT", lambda: librosa.feature.spectral_centroid(y=y, sr=sr)),
        ("SPEC_BW", lambda: librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        ("ROLLOFF", lambda: librosa.feature.spectral_rolloff(y=y, sr=sr)),
        ("ZCR", lambda: librosa.feature.zero_crossing_rate(y)),
        ("MFCC", lambda: librosa.feature.mfcc(y=y, sr=sr)),
    )
    for heading, extract in extractors:
        print(heading)
        s = extract()
        print("mu",np.mean(s))
        print("std",np.std(s))
    print()


In [59]:
# Report feature statistics for the second, third and fourth clip in the frame.
for position in (1, 2, 3):
    present_values(df.iloc[position, :])

CHROMA_STFT
mu 0.41547176
std 0.29748425
RMS
mu 0.04921781
std 0.056908384
SPEC_CENT
mu 1716.5226324891005
std 831.6682065660448
SPEC_BW
mu 1573.9692546587892
std 702.5657461799505
ROLLOFF
mu 3361.9780553069936
std 1688.8124031420805
ZCR
mu 0.09842631470264317
std 0.059409840180946874
MFCC
mu -17.076149
std 99.626816

CHROMA_STFT
mu 0.41880167
std 0.30273342
RMS
mu 0.07016978
std 0.07234158
SPEC_CENT
mu 2510.204411941715
std 1397.4190188000432
SPEC_BW
mu 2305.931292506095
std 440.7951105755935
ROLLOFF
mu 4676.633432749155
std 2203.9012425985393
ZCR
mu 0.14572116061373874
std 0.11722953256622383
MFCC
mu -14.471638
std 78.529625

CHROMA_STFT
mu 0.34608158
std 0.30716756
RMS
mu 0.117950104
std 0.08522534
SPEC_CENT
mu 2173.9555456975413
std 1327.5899866916104
SPEC_BW
mu 2212.128879426947
std 505.6201797101605
ROLLOFF
mu 3976.6261411516853
std 2268.469148657458
ZCR
mu 0.11537153265449439
std 0.10098572296266004
MFCC
mu -9.808111
std 60.242607



# Modelling

In [67]:
# Read the comma-separated feature file at `output_pathname` back into `df`.
df = pd.read_csv(output_pathname, sep=",")

In [68]:
# Display the reloaded frame.
df

Unnamed: 0,"up_votes,down_votes,path,age,gender,accent,chroma_stft,rms,spec_cent,spec_bw,rolloff,zcr,mfcc",chroma_stft,rms,spec_cent,spec_bw,rolloff,zcr,mfcc
0,"2,0,common_voice_en_10091129.mp3,fifties,male,...",0.472234,0.024861,2307.869775,2132.580959,4797.588375,0.136395,-24.520874
1,"2,0,common_voice_en_10091130.mp3,fifties,male,...",0.415472,0.049218,1716.522632,1573.969255,3361.978055,0.098426,-17.076149
2,"3,0,common_voice_en_100737.mp3,thirties,male,b...",0.418802,0.070170,2510.204412,2305.931293,4676.633433,0.145721,-14.471638
3,"3,1,common_voice_en_100738.mp3,thirties,male,b...",0.346082,0.117950,2173.955546,2212.128879,3976.626141,0.115372,-9.808111
4,"3,0,common_voice_en_100739.mp3,thirties,male,b...",0.381020,0.087444,2312.963349,2266.425099,4328.682123,0.134091,-13.030169
...,...,...,...,...,...,...,...,...
1187,"2,0,common_voice_en_10209062.mp3,fifties,femal...",0.439742,0.033243,2970.832932,2662.461360,6184.766602,0.167612,-22.828602
1188,"2,0,common_voice_en_10209063.mp3,fifties,femal...",0.399398,0.026217,2760.807520,2266.275394,5187.672335,0.177096,-21.018087
1189,"2,0,common_voice_en_10209065.mp3,fifties,femal...",0.388222,0.024172,2230.666062,2299.027168,4676.918096,0.113927,-20.731462
1190,"2,0,common_voice_en_10209066.mp3,fifties,femal...",0.439761,0.021787,2918.167471,2388.274420,5495.593766,0.187932,-19.918543


In [77]:
# Mean divided by standard deviation for every numeric column, i.e. how large
# the mean is relative to the spread (the inverse of the coefficient of variation).
# numeric_only=True keeps the text columns (path, age, gender, accent) out of
# the reduction; without it recent pandas versions raise a TypeError.
df.mean(numeric_only=True) / df.std(numeric_only=True)

chroma_stft    8.222928
rms            1.329743
spec_cent      4.155171
spec_bw        5.293673
rolloff        3.929714
zcr            3.116444
mfcc          -4.162939
dtype: float64