# Step 01 - Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## Step 02 - Data Pre-Processing

In [None]:
# Read './gender_voice_dataset.csv' (path relative to the working directory)
# into a DataFrame.
df = pd.read_csv('./gender_voice_dataset.csv')

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Print a structural summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show summary statistics using describe()'s default settings.
df.describe()

In [None]:
# Count how many rows carry each distinct value of the 'label' column.
df['label'].value_counts()

In [None]:
# Encode the target as integers: 'male' -> 1, 'female' -> 0.
# The explicit astype pins the column to an integer dtype instead of relying
# on replace() silently downcasting the object column (that implicit
# downcast is deprecated as of pandas 2.2). Without it the labels can stay
# object-typed, which downstream estimators do not accept as class labels.
# Re-running this cell on already-encoded data is still a no-op.
df['label'] = df['label'].replace({'male': 1, 'female': 0}).astype('int64')

In [None]:
# Pull the target column out as a 1-D array and keep every other column as
# the 2-D feature matrix.
y = df['label'].values
x = df.drop(columns='label').values

In [None]:
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [None]:
x_train_full, x_test, y_train_full, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [None]:
# Display the shapes of the training-side feature matrix and label vector.
x_train_full.shape, y_train_full.shape

In [None]:
# The first 500 rows of the training partition are used for fitting; every
# remaining row becomes the validation set.
# NOTE(review): depending on the dataset size this may leave far more rows for
# validation than for training - confirm the split is the intended way round.
x_train, x_val = x_train_full[:500], x_train_full[500:]
y_train, y_val = y_train_full[:500], y_train_full[500:]

## Building the Keras model

In [None]:
from tensorflow import keras

In [None]:
# Start from an empty Sequential model; layers are appended in the next cell.
model = keras.Sequential()

In [None]:
# Declare the input width from the training data rather than hard-coding it,
# and use an explicit Input object: passing input_shape= to a Dense layer
# triggers a warning in Keras 3.
model.add(keras.Input(shape=(x_train.shape[1],)))

# Three hidden blocks of Dense(relu) -> BatchNormalization -> Dropout, with
# the layer width shrinking and the dropout rate rising at each step.
for units, dropout_rate in ((128, 0.2), (64, 0.4), (32, 0.5)):
    model.add(keras.layers.Dense(units=units, activation='relu'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dropout(dropout_rate))

# Single sigmoid unit: emits one value in (0, 1) per sample for the binary
# target.
model.add(keras.layers.Dense(units=1, activation='sigmoid'))


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs, evaluating on the validation split alongside training.
# The object returned by fit() is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=20,
    verbose=2,
)

In [None]:
# Run the network on the held-out test features. The final layer is a
# sigmoid, so each output is a value between 0 and 1.
y_pred = model.predict(x_test)

In [None]:
# Convert the sigmoid outputs to hard class labels: strictly above 0.5 -> 1,
# everything else -> 0.
pred = np.where(y_pred > 0.5, 1, 0)

In [None]:
# Print the accuracy of the thresholded predictions against the test labels.
print(metrics.accuracy_score(y_test, pred))

## Building the sklearn model

In [None]:
# Classifier classes (not instances) to compare; modelScore instantiates each
# one with its default arguments.
models = [
    RandomForestClassifier,
    DecisionTreeClassifier,
    LogisticRegression,
    SVC,
]

In [None]:
def modelScore(x):
    """Fit one classifier on the training split and report its accuracy.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` arrays. Prints the estimator's class name followed by its
    training and test accuracy.

    Args:
        x: An estimator class (not an instance). It is called with no
            arguments, so the estimator's defaults are used. Note that
            inside this function the name shadows the notebook-level
            feature matrix ``x``.

    Returns:
        A ``(train_score, test_score)`` tuple so callers can collect and
        compare results instead of parsing the printed output.
    """
    estimator = x()
    estimator.fit(x_train, y_train)

    train_score = estimator.score(x_train, y_train)
    test_score = metrics.accuracy_score(y_test, estimator.predict(x_test))

    print(type(estimator).__name__)
    print(train_score, "Training score")
    # Label the test figure as well, so the two numbers cannot be confused.
    print(test_score, "Test score")
    return train_score, test_score

In [None]:
# Score every candidate classifier in turn, printing a dashed rule after each
# report to keep the outputs visually separate.
for model_cls in models:
    modelScore(model_cls)
    print('--' * 20)
