In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier #Read the voice dataset
from sklearn.metrics import f1_score

# Load the feature table and keep only rows labelled 'male' or 'female'.
mydata = pd.read_csv("features.csv")
mydata = mydata[mydata['gender'].isin(['male', 'female'])]

# 'path' is not used as a model input (presumably the source file path --
# TODO confirm); rows with any missing value are discarded.
mydata = mydata.drop(['path'], axis=1)
mydata = mydata.dropna()

# `columns` holds the feature names only; the frame is reordered so that the
# 'gender' label is the last column.
columns = mydata.columns.tolist()
columns.remove('gender')
mydata = mydata[columns + ['gender']]

# 80/20 split with a fixed seed so the reported scores are reproducible.
mydata_train, mydata_test = train_test_split(mydata, random_state=0, test_size=.2)

# Standardise features using statistics from the training split only, then
# apply the same transform to the test split (no test-set leakage).
# Features are selected by name rather than by position so both splits are
# guaranteed to use the same columns in the same order.
scaler1 = StandardScaler()
scaler1.fit(mydata_train[columns])

X_train = scaler1.transform(mydata_train[columns])
X_test = scaler1.transform(mydata_test[columns])

# Encode the label as female -> 0, male -> 1.
# Series.map builds a new Series instead of writing into the array returned
# by `.values`: that array may alias the frame's storage (so the split
# frames' 'gender' column would be overwritten as a side effect) and can be
# read-only when pandas Copy-on-Write is active.
label_codes = {'female': 0, 'male': 1}
y_train = mydata_train['gender'].map(label_codes).astype('int').to_numpy()
y_test = mydata_test['gender'].map(label_codes).astype('int').to_numpy()


(19262, 22)
Index(['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt',
       'sp.ent', 'sfm', 'mode', 'centroid', 'peakf', 'meanfun', 'minfun',
       'maxfun', 'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx',
       'gender'],
      dtype='object')
(3168, 21)


In [30]:
# Train decision tree model
# Entropy split criterion, depth capped at 20, fixed seed for reproducibility.
DT = DecisionTreeClassifier(random_state=0, max_depth=20, criterion="entropy").fit(X_train, y_train)

# Predict each split once and derive both metrics from the cached labels
# (score() plus f1_score(predict()) would run inference twice per split).
# mean(pred == y) is the plain accuracy that DT.score() reports.
dt_pred_train = DT.predict(X_train)
dt_pred_test = DT.predict(X_test)

# NOTE(review): the recorded run shows a large train/test gap for this model,
# which suggests the depth-20 tree overfits; max_depth is worth tuning.
print("Decision Tree")
print("Accuracy on training set: {:.3f}".format(np.mean(dt_pred_train == y_train)))
print("Accuracy on test set: {:.3f}".format(np.mean(dt_pred_test == y_test)))
print("F1 score on training set: {:.3f}".format(f1_score(y_train, dt_pred_train)))
print("F1 score on test set: {:.3f}".format(f1_score(y_test, dt_pred_test)))


Decision Tree
Accuracy on training set: 0.994
Accuracy on test set: 0.815
F1 score on training set: 0.995
F1 score on test set: 0.822


In [38]:

#Train random forest model
# 14 entropy-criterion trees, each capped at depth 10, with a fixed seed.
forest = RandomForestClassifier(n_estimators=14, random_state=0, max_depth=10, criterion="entropy").fit(X_train, y_train)

# Predict each split once and reuse the labels for both metrics instead of
# re-running inference inside score() and again for f1_score().
# mean(pred == y) is the plain accuracy that forest.score() reports.
forest_pred_train = forest.predict(X_train)
forest_pred_test = forest.predict(X_test)

print("Random Forests")
print("Accuracy on training set: {:.3f}".format(np.mean(forest_pred_train == y_train)))
print("Accuracy on test set: {:.3f}".format(np.mean(forest_pred_test == y_test)))
print("F1 score on training set: {:.3f}".format(f1_score(y_train, forest_pred_train)))
print("F1 score on test set: {:.3f}".format(f1_score(y_test, forest_pred_test)))



Random Forests
Accuracy on training set: 0.926
Accuracy on test set: 0.872
F1 score on training set: 0.929
F1 score on test set: 0.879


In [41]:
#Train gradient boosting model
# Learning rate 0.1, individual trees capped at depth 10, fixed seed.
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.1, max_depth=10).fit(X_train, y_train)

# Predict each split once and reuse the labels for both metrics instead of
# re-running inference inside score() and again for f1_score().
# mean(pred == y) is the plain accuracy that gbrt.score() reports.
gbrt_pred_train = gbrt.predict(X_train)
gbrt_pred_test = gbrt.predict(X_test)

# NOTE(review): the recorded run shows near-perfect training scores against a
# clearly lower test score, so depth-10 base learners look too expressive here.
print("Gradient Boosting")
print("Accuracy on training set: {:.3f}".format(np.mean(gbrt_pred_train == y_train)))
print("Accuracy on test set: {:.3f}".format(np.mean(gbrt_pred_test == y_test)))
print("F1 score on training set: {:.3f}".format(f1_score(y_train, gbrt_pred_train)))
print("F1 score on test set: {:.3f}".format(f1_score(y_test, gbrt_pred_test)))



Gradient Boosting
Accuracy on training set: 0.997
Accuracy on test set: 0.873
F1 score on training set: 0.998
F1 score on test set: 0.879


In [43]:
# Train support vector machine model
# Regularisation parameter C=100; every other SVC setting is left at its default.
svm = SVC(C=100).fit(X_train, y_train)

# Predict each split once and reuse the labels for both metrics. Calling
# score() and then f1_score(predict()) repeats inference on the same data,
# which is the slowest step of this cell for a kernel SVM.
# mean(pred == y) is the plain accuracy that svm.score() reports.
svm_pred_train = svm.predict(X_train)
svm_pred_test = svm.predict(X_test)

print("Support Vector Machine")
print("Accuracy on training set: {:.3f}".format(np.mean(svm_pred_train == y_train)))
print("Accuracy on test set: {:.3f}".format(np.mean(svm_pred_test == y_test)))
print("F1 score on training set: {:.3f}".format(f1_score(y_train, svm_pred_train)))
print("F1 score on test set: {:.3f}".format(f1_score(y_test, svm_pred_test)))



Support Vector Machine
Accuracy on training set: 0.930
Accuracy on test set: 0.868
F1 score on training set: 0.933
F1 score on test set: 0.873


In [46]:
#Train neural network model
# Default architecture with a fixed seed; training may run for up to 1000 iterations.
mlp = MLPClassifier(random_state=0, max_iter=1000).fit(X_train, y_train)

# Predict each split once and reuse the labels for both metrics instead of
# re-running inference inside score() and again for f1_score().
# mean(pred == y) is the plain accuracy that mlp.score() reports.
mlp_pred_train = mlp.predict(X_train)
mlp_pred_test = mlp.predict(X_test)

print("Multilayer Perceptron")
print("Accuracy on training set: {:.3f}".format(np.mean(mlp_pred_train == y_train)))
print("Accuracy on test set: {:.3f}".format(np.mean(mlp_pred_test == y_test)))
print("F1 score on training set: {:.3f}".format(f1_score(y_train, mlp_pred_train)))
print("F1 score on test set: {:.3f}".format(f1_score(y_test, mlp_pred_test)))



Multilayer Perceptron
Accuracy on training set: 0.904
Accuracy on test set: 0.875
F1 score on training set: 0.908
F1 score on test set: 0.880
