In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


In [None]:
# Read the voice dataset from the Kaggle input directory and preview the first rows.
VOICE_CSV = "/kaggle/input/voicegender/voice.csv"
df = pd.read_csv(VOICE_CSV)
df.head()

The Dataset
The following acoustic properties of each voice are measured and included within the CSV:

* meanfreq: mean frequency (in kHz)
* sd: standard deviation of frequency
* median: median frequency (in kHz)
* Q25: first quartile (in kHz)
* Q75: third quartile (in kHz)
* IQR: interquartile range (in kHz)
* skew: skewness (see note in specprop description)
* kurt: kurtosis (see note in specprop description)
* sp.ent: spectral entropy
* sfm: spectral flatness
* mode: mode frequency
* centroid: frequency centroid (see specprop)
* meanfun: average of fundamental frequency measured across acoustic signal
* minfun: minimum fundamental frequency measured across acoustic signal
* maxfun: maximum fundamental frequency measured across acoustic signal
* meandom: average of dominant frequency measured across acoustic signal
* mindom: minimum of dominant frequency measured across acoustic signal
* maxdom: maximum of dominant frequency measured across acoustic signal
* dfrange: range of dominant frequency measured across acoustic signal
* modindx: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range
* label: male or female

In [None]:
# Summary statistics, transposed so that each feature is shown as a row.
df.describe().T

Let's check if we have null data.

In [None]:
# Number of null values in each column.
df.isnull().sum()

In [None]:
# Print the DataFrame's column overview.
df.info()

In [None]:
# Correlation heatmap of the acoustic features.
# `label` holds strings (male / female), so only numeric columns are correlated:
# since pandas 2.0 `DataFrame.corr()` no longer drops non-numeric columns
# silently and raises on them instead.
feature_corr = df.select_dtypes(include="number").corr()

plt.figure(figsize=(16,10), dpi=200)
sns.heatmap(feature_corr, annot=True, cmap="mako", fmt='.2f', linewidths=0.5);

In [None]:
# dfrange:  range of dominant frequency measured across acoustic signal
# meanfreq: mean frequency (in kHz)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x="dfrange", y="meanfreq", hue="label", ax=ax);

In [None]:
# sp.ent: spectral entropy, drawn as one histogram per label.
# No `color` argument is passed: seaborn only applies `color` when there is no
# hue mapping, so it would have no effect alongside hue="label".
plt.figure(figsize=(10,6))
sns.histplot(data=df, x="sp.ent", hue="label");

In [None]:
# sfm: spectral flatness, drawn as one histogram per label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="sfm", hue="label", ax=ax);

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The target is the `label` column; the features are every other column.
y = df["label"]
X = df.drop(columns="label")

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=47,
)

# Support Vector Machines

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Scaler and SVM classifier, both created with their default settings.
scaler = StandardScaler()
svc = SVC()

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
# Names of the parameters exposed by the SVC estimator.
svc.get_params().keys()

In [None]:
# Hyper-parameter search space for SVC.
# - C must be strictly positive, so 0 is not offered as a candidate (a fit with
#   C=0 cannot succeed and only produces failed cross-validation runs).
# - One sub-grid per kernel, so parameters a kernel ignores are not multiplied
#   into the search: `degree` only affects 'poly', and `gamma` is not used by
#   'linear'.
C_VALUES = [0.1, 1, 10, 100, 1000]
GAMMA_VALUES = ["scale", "auto"]

param_grid = [
    {"kernel": ["linear"], "C": C_VALUES},
    {"kernel": ["rbf"], "C": C_VALUES, "gamma": GAMMA_VALUES},
    {"kernel": ["poly"], "C": C_VALUES, "gamma": GAMMA_VALUES, "degree": [2, 3, 4, 5]},
]

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over `param_grid`, scored by accuracy.
# NOTE(review): the folds are cut from `scaled_X_train`, which was scaled using
# the whole training split, so every validation fold has already contributed
# to the scaler's statistics.
grid = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)
grid.fit(scaled_X_train, y_train)

In [None]:
# Parameter combination selected by the SVM grid search.
grid.best_params_

In [None]:
# Predict labels for the scaled test features with the fitted grid search object.
y_preds = grid.predict(scaled_X_test)

In [None]:
# Classification report for the SVM predictions on the test split.
print(classification_report(y_test, y_preds))

In [None]:
# Confusion matrix for the SVM predictions; rows and columns follow grid.classes_.
cm = confusion_matrix(
    y_true=y_test,
    y_pred=y_preds,
    labels=grid.classes_,
)
cm1 = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=grid.classes_,
)
cm1.plot()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Test accuracy of the SVM as a percentage, rounded to three decimals.
svm_accuracy = accuracy_score(y_test, y_preds)
acc = round(svm_accuracy * 100, 3)
acc

# K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# KNN classifier created with default settings.
knn = KNeighborsClassifier()

In [None]:
# Names of the parameters exposed by the KNN estimator.
knn.get_params().keys()

In [None]:
# Pipeline steps as (name, object) pairs: the scaler first, then the KNN classifier.
operations = [("scaler",scaler), ("knn",knn)]

In [None]:
from sklearn.pipeline import Pipeline
# Chain the steps listed in `operations` into a single estimator.
pipe = Pipeline(operations)

In [None]:
# Candidate neighbourhood sizes: every k from 1 through 29.
# The "knn__" prefix addresses the pipeline step named "knn".
k_values = [k for k in range(1, 30)]
param_grid = dict(knn__n_neighbors=k_values)

In [None]:
# 5-fold cross-validated search over k, scored by accuracy. The search runs on
# the whole pipeline with the unscaled training data.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)
grid.fit(X_train, y_train)

In [None]:
# Full parameter set of the best pipeline found by the KNN grid search.
grid.best_estimator_.get_params()

Based on the grid search, we set `knn__n_neighbors` to 7 for the final model (the value used in the next cell).

In [None]:
# Final KNN model with a fixed number of neighbours, plus its pipeline steps.
# NOTE(review): k=7 is hard-coded here rather than read from grid.best_params_ —
# confirm it matches the grid search result.
knn7 = KNeighborsClassifier(n_neighbors=7)
operations = [('scaler',scaler),('knn7',knn7)]

In [None]:
# Rebuild the pipeline from the current `operations` and fit it on the training split.
pipe = Pipeline(operations)
pipe.fit(X_train,y_train)

In [None]:
# Predict test labels with the fitted pipeline (unscaled X_test goes in; scaling is a pipeline step).
y_pred = pipe.predict(X_test)

In [None]:
# Classification report for the KNN pipeline predictions on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the KNN pipeline; rows and columns follow pipe.classes_.
knn_labels = pipe.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=knn_labels)
cm1 = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=knn_labels,
)
cm1.plot()
plt.show()

In [None]:
# Test accuracy of the KNN pipeline as a percentage, rounded to three decimals.
knn_accuracy = accuracy_score(y_test, y_pred)
acc2 = round(knn_accuracy * 100, 3)
acc2

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters, trained on the unscaled features.
# A fixed random_state (the same seed value the train/test split uses) makes the
# forest's internal randomness, and therefore the reported scores, identical on
# every Restart & Run All.
rfc = RandomForestClassifier(random_state=47)
rfc.fit(X_train, y_train)

In [None]:
# Predict test labels with the random forest and print its classification report.
y_pred1 = rfc.predict(X_test)
print(classification_report(y_test, y_pred1))

In [None]:
# Confusion matrix for the random forest; rows and columns follow rfc.classes_.
rf_labels = rfc.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred1, labels=rf_labels)
cm1 = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=rf_labels,
)
cm1.plot()
plt.show()

In [None]:
# Test accuracy of the random forest as a percentage, rounded to three decimals.
rf_accuracy = accuracy_score(y_test, y_pred1)
acc3 = round(rf_accuracy * 100, 3)
acc3

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters, trained on the unscaled features.
# A fixed random_state (the same seed value the train/test split uses) keeps
# the fitted tree, and therefore the reported score, identical on every
# Restart & Run All.
dt = DecisionTreeClassifier(random_state=47)
dt.fit(X_train, y_train)
y_pred2 = dt.predict(X_test)

In [None]:
# Confusion matrix for the decision tree; rows and columns follow dt.classes_.
dt_labels = dt.classes_
cm = confusion_matrix(y_true=y_test, y_pred=y_pred2, labels=dt_labels)
cm1 = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=dt_labels,
)
cm1.plot()
plt.show()

In [None]:
# Test accuracy of the decision tree as a percentage, rounded to three decimals.
dt_accuracy = accuracy_score(y_test, y_pred2)
acc4 = round(dt_accuracy * 100, 3)
acc4

In [None]:
# Collect each model's test accuracy (%) into one table, shown best score first.
model_names = ['Support Vector Machines', 'KNN', 'Random Forest', 'Decision Tree']
model_scores = [acc, acc2, acc3, acc4]

models = pd.DataFrame({'Model': model_names, 'Score': model_scores})
models.sort_values(by='Score', ascending=False)