# Kaggle [Gender Recognition by Voice]
- https://www.kaggle.com/primaryobjects/voicegender

## 1. 데이터 로드 및 분할

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from pprint import pprint

In [3]:
df = pd.read_csv('data/voice.csv')
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]

함수 `train_val_test_split`을 정의: 데이터를 학습 셋 (training set), 검증 셋 (validation set), 테스트 셋 (test set) 3개로 분할

In [4]:
def train_val_test_split(X, y, val_size=0.3, test_size=0.2, random_state=123):
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
    val_size_rev = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                      test_size=val_size_rev,
                                                      random_state=random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [8]:
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y,
                                                                      val_size=0.3,
                                                                      test_size=0.2,
                                                                      random_state=123)

In [9]:
# Training 50%, validation 30%, test 20%
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(1583, 20)
(951, 20)
(634, 20)


## 2. 모델 학습 및 검증

### 2.1. Decision tree
- 나무 깊이에 대해 다른 파라미터값 부여
- 각각의 파라미터에 대해 Train set으로 모델을 생성한 후, Validation set으로 성능 평가 ==> 가장 성능이 좋은 파라미터와 모델 선택

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

depth_set = [3, 4, 5, 6, 7, 8, 9, 10]
dt_models = []
accuracy_set = []
cm_set = []
train_accuracy_set = []

for depth in depth_set:
    model = DecisionTreeClassifier(max_depth = depth, random_state = 1)
    model.fit(X_train, y_train)
    y_train_hat = model.predict(X_train)
    y_val_hat = model.predict(X_val)
    train_accuracy = metrics.accuracy_score(y_train, 
                                            y_train_hat)
    accuracy = metrics.accuracy_score(y_val, y_val_hat)
    cm = metrics.confusion_matrix(y_val, y_val_hat)
    
    dt_models.append(model)
    accuracy_set.append(accuracy)
    train_accuracy_set.append(train_accuracy)
    cm_set.append(cm)

In [13]:
# 파라미터 탐색 결과, 가장 좋은 모델과 Validation set에 대한 정확도
max_value = max(accuracy_set)
max_index = accuracy_set.index(max_value)
print(max_index)
print(max_value)

4
0.9684542586750788


In [16]:
# 가장 좋은 모델
best_dt = dt_models[max_index]
print(best_dt)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')


### 2.2. Random Forest
- 나무의 갯수, 각 나무의 변수 선택 수를 파라미터로 설정
- 각각의 파라미터 집합에 대해 Train set으로 모델을 생성한 후, Validation set으로 성능 평가 ==> 가장 성능이 좋은 파라미터와 모델 선택

In [17]:
from sklearn.ensemble import RandomForestClassifier

n_estimators_set = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
max_features_set = ['auto', 'log2']

rf_models = []
accuracy_set = []
cm_set = []

for n_estimators in n_estimators_set:
    for max_features in max_features_set:
        rf = RandomForestClassifier(n_estimators = n_estimators,
                                    max_features = max_features,
                                    random_state = 123)
        rf.fit(X_train, y_train)
        y_val_hat = rf.predict(X_val)
        accuracy = metrics.accuracy_score(y_val, y_val_hat)
        cm = metrics.confusion_matrix(y_val, y_val_hat)

        rf_models.append(rf)
        accuracy_set.append(accuracy)
        cm_set.append(cm)

In [18]:
# 파라미터 탐색 결과, 가장 좋은 모델과 Validation set에 대한 정확도
max_value = max(accuracy_set)
max_index = accuracy_set.index(max_value)
print(max_index)
print(max_value)

4
0.9842271293375394


In [19]:
# 가장 좋은 모델
best_rf = rf_models[max_index]
print(best_rf)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)


### 2.3. Logistic Regression
- 페널티 유형(penalty), 페널티 정도(C), 클래스 가중치 유형(class_weight) 등에 대해 탐색

In [20]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameter caldidates
penalty_set = ['l1', 'l2']
C_set = [0.01, 0.1, 1, 10, 100]
class_weight_set = [None, 'balanced']

# 결과 저장을 미리 할당하기 위한 리스트 선언
train_acc_set = []
val_acc_set = []
lr_models = []

for penalty in penalty_set:
    for C in C_set:
        for class_weight in class_weight_set:
            lr = LogisticRegression(penalty=penalty, C=C, 
                                    class_weight=class_weight,
                                    random_state=2072)
            # Train the model
            lr.fit(X_train, y_train)
            lr_models.append(lr)
            
            # Calculate training accuracy and validation accuracy
            y_train_hat = lr.predict(X_train)
            y_val_hat = lr.predict(X_val)
            train_acc = metrics.accuracy_score(y_train, y_train_hat)
            val_acc = metrics.accuracy_score(y_val, y_val_hat)
            train_acc_set.append(train_acc)
            val_acc_set.append(val_acc)
            

In [21]:
# 파라미터 탐색 결과, 가장 좋은 모델과 Validation set에 대한 정확도
max_value = max(val_acc_set)
max_index = val_acc_set.index(max_value)
print(max_index)
print(max_value)

6
0.9779179810725552


In [22]:
# 가장 좋은 모델
best_lr = lr_models[max_index]
print(best_lr)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=2072, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


### 2.4. k-Nearest Neighbors Classifier
- 이웃 수(n_neighbors), 가중치 부여 방법(weights)을 컨트롤

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter caldidates
n_neighbors_set = [10, 15, 20, 25, 30]
weights_set = ['uniform', 'distance']

# 결과 저장을 미리 할당하기 위한 리스트 선언
train_acc_set = []
val_acc_set = []
knn_models = []

for n_neighbors in n_neighbors_set:
    for weights in weights_set:
        knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=-1)
        
        # Train the model
        knn.fit(X_train, y_train)
        knn_models.append(knn)

        # Calculate training accuracy and validation accuracy
        y_train_hat = knn.predict(X_train)
        y_val_hat = knn.predict(X_val)
        train_acc = metrics.accuracy_score(y_train, y_train_hat)
        val_acc = metrics.accuracy_score(y_val, y_val_hat)
        train_acc_set.append(train_acc)
        val_acc_set.append(val_acc)

In [24]:
# 파라미터 탐색 결과, 가장 좋은 모델과 Validation set에 대한 정확도
max_value = max(val_acc_set)
max_index = val_acc_set.index(max_value)
print(max_index)
print(max_value)

3
0.6929547844374343


In [26]:
# 가장 좋은 모델
best_knn = knn_models[max_index]
print(best_knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
           weights='distance')


### 2.5. Naive Bayes Classifier
- Naive Bayes Classifier는 딱히 컨트롤할 하이퍼파라미터가 없음

In [27]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

# Train the model
nb.fit(X_train, y_train)

# Calculate training accuracy and validation accuracy
y_train_hat = nb.predict(X_train)
y_val_hat = nb.predict(X_val)
train_acc = metrics.accuracy_score(y_train, y_train_hat)
val_acc = metrics.accuracy_score(y_val, y_val_hat)


In [28]:
print(val_acc)

0.8633017875920084


In [29]:
best_nb = nb
print(nb)

GaussianNB(priors=None)


## 3. 테스트 셋을 이용하여 가장 성능이 좋은 모델 찾기
- 앞서 학습한 Decision Tree, Random Forest, Logistic Regression, k-Nearest Neighbors Classifier, Naive Bayes Classifier 별로 가장 검증 데이터셋에 정확도가 좋았던 모델들을 모은 후
- 테스트 셋에 대한 정확도가 가장 좋은 모델을 찾기

In [34]:
best_models = [best_dt, best_rf, best_lr, best_knn, best_nb]
pprint(best_models)

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False),
 LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=2072, solver='liblinear', tol=0.0001,
          verbose=

In [37]:
# Trainining set과 Validation set을 합친 후 Test set에 대해 예측 성능 평가
X_concat = pd.concat([X_train, X_val])
y_concat = pd.concat([y_train, y_val])

for best_model in best_models:
    
    # 합친 데이터에 모델을 refit
    best_model.fit(X_concat, y_concat)
    
    # Test set에 대해 예측 성능 평가
    y_test_hat = best_model.predict(X_test)
    
    print(best_model)
    print(metrics.accuracy_score(y_test, y_test_hat))
    print(metrics.confusion_matrix(y_test, y_test_hat))
    print('='*80)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')
0.9621451104100947
[[310  18]
 [  6 300]]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)
0.9763406940063092
[[317  11]
 [  4 302]]
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          pena