# Import required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

  from ._conv import register_converters as _register_converters


# Loading the dataset

In [2]:
# Read the voice dataset CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
data = pd.read_csv("../data/internal/voice.csv")

# Look at the dataset

In [3]:
# Preview the first 7 rows to sanity-check the columns and value ranges.
data.head(7)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male
5,0.132786,0.079557,0.11909,0.067958,0.209592,0.141634,1.932562,8.308895,0.963181,0.738307,...,0.132786,0.110132,0.017112,0.253968,0.298222,0.007812,2.726562,2.71875,0.12516,male
6,0.150762,0.074463,0.160106,0.092899,0.205718,0.112819,1.530643,5.987498,0.967573,0.762638,...,0.150762,0.105945,0.02623,0.266667,0.47962,0.007812,5.3125,5.304688,0.123992,male


# A look at the columns

In [4]:
# Show the column labels of the DataFrame (the wide preview above elides some with "...").
data.columns

Index(['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt',
       'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun',
       'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx', 'label'],
      dtype='object')

# Size of dataset (total number of elements)

In [5]:
# NOTE: DataFrame.size is the total number of elements (rows x columns), not the
# number of rows; use data.shape to see the row and column counts separately.
data.size

66528

# Description of the dataset

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
count,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0
mean,0.180907,0.057126,0.185621,0.140456,0.224765,0.084309,3.140168,36.568461,0.895127,0.408216,0.165282,0.180907,0.142807,0.036802,0.258842,0.829211,0.052647,5.047277,4.99463,0.173752
std,0.029918,0.016652,0.03636,0.04868,0.023639,0.042783,4.240529,134.928661,0.04498,0.177521,0.077203,0.029918,0.032304,0.01922,0.030077,0.525205,0.063299,3.521157,3.520039,0.119454
min,0.039363,0.018363,0.010975,0.000229,0.042946,0.014558,0.141735,2.068455,0.738651,0.036876,0.0,0.039363,0.055565,0.009775,0.103093,0.007812,0.004883,0.007812,0.0,0.0
25%,0.163662,0.041954,0.169593,0.111087,0.208747,0.04256,1.649569,5.669547,0.861811,0.258041,0.118016,0.163662,0.116998,0.018223,0.253968,0.419828,0.007812,2.070312,2.044922,0.099766
50%,0.184838,0.059155,0.190032,0.140286,0.225684,0.09428,2.197101,8.318463,0.901767,0.396335,0.186599,0.184838,0.140519,0.04611,0.271186,0.765795,0.023438,4.992188,4.945312,0.139357
75%,0.199146,0.06702,0.210618,0.175939,0.24366,0.114175,2.931694,13.648905,0.928713,0.533676,0.221104,0.199146,0.169581,0.047904,0.277457,1.177166,0.070312,7.007812,6.992188,0.209183
max,0.251124,0.115273,0.261224,0.247347,0.273469,0.252225,34.725453,1309.612887,0.981997,0.842936,0.28,0.251124,0.237636,0.204082,0.279114,2.957682,0.458984,21.867188,21.84375,0.932374


# Data Preprocessing

In [7]:
from sklearn import preprocessing

# Learn the label vocabulary first, then replace the string labels with their
# integer codes. The code of each class is its position in `le.classes_`.
le = preprocessing.LabelEncoder().fit(data["label"])
data["label"] = le.transform(data["label"])

# Display the learned classes so the integer -> label mapping is visible.
le.classes_

array(['female', 'male'], dtype=object)

In [8]:
# Rescale every column with a min-max scaler and write the result back into the
# existing DataFrame. The encoded `label` column goes through the scaler too.
scaler = preprocessing.MinMaxScaler()
scaled_values = scaler.fit_transform(data)
data[:] = scaled_values

# Preview the scaled frame.
data.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.096419,0.473409,0.084125,0.060063,0.204956,0.254828,0.367853,0.208279,0.635798,0.564526,...,0.096419,0.157706,0.030501,0.981526,0.0,0.006452,0.0,0.0,0.0,1.0
1,0.125828,0.505075,0.1169,0.077635,0.215683,0.246961,0.644279,0.483766,0.630964,0.591578,...,0.125828,0.287642,0.03114,0.8346,0.000407,0.006452,0.002144,0.002146,0.056449,1.0
2,0.179222,0.675536,0.102873,0.034284,0.385912,0.457148,0.885255,0.782275,0.442738,0.548382,...,0.179222,0.236945,0.030264,0.954963,6e-05,0.006452,0.000357,0.000358,0.049885,1.0
3,0.528261,0.554611,0.587559,0.389906,0.715802,0.407358,0.031549,0.001613,0.923261,0.856457,...,0.528261,0.183442,0.041287,0.8346,0.065659,0.006452,0.025375,0.025393,0.265043,1.0
4,0.452195,0.627209,0.454272,0.317627,0.707515,0.474474,0.027742,0.001732,0.958736,0.926348,...,0.452195,0.27919,0.036829,0.929285,0.238994,0.006452,0.250536,0.250715,0.22338,1.0


In [58]:
# Correlation of every column with the encoded `label` column, taken from the
# full correlation matrix of the DataFrame.
data.corr()['label']

meanfreq   -0.337415
sd          0.479539
median     -0.283919
Q25        -0.511455
Q75         0.066906
IQR         0.618916
skew        0.036627
kurt        0.087195
sp.ent      0.490552
sfm         0.357499
mode       -0.171775
centroid   -0.337415
meanfun    -0.833921
minfun     -0.136692
maxfun     -0.166461
meandom    -0.191067
mindom     -0.194974
maxdom     -0.195657
dfrange    -0.192213
modindx     0.030801
label       1.000000
Name: label, dtype: float64

In [62]:
# Collect the columns whose absolute correlation with `label` exceeds a threshold.
#
# The correlation matrix is computed once up front. The previous version called
# data.corr() again in the loop header and on every matching iteration, which
# repeated the same full-matrix computation many times for an identical result.
CORR_THRESHOLD = 0.3  # minimum |correlation with label| for a column to be kept

label_corr = data.corr()['label']

# Boolean indexing preserves the original column order and drops NaN
# correlations (a NaN comparison is False). `label` itself is included because
# its self-correlation is 1.0.
high_corr = label_corr[label_corr.abs() > CORR_THRESHOLD].index.tolist()

# Print each selected column on its own line, then the full list.
for column in high_corr:
    print(column)

print(high_corr)

meanfreq
sd
Q25
IQR
sp.ent
sfm
centroid
meanfun
label
['meanfreq', 'sd', 'Q25', 'IQR', 'sp.ent', 'sfm', 'centroid', 'meanfun', 'label']
