In [1]:
import sklearn
import numpy as np
import pandas as pd
import xgboost as xg
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split

%matplotlib inline  

In [13]:
# --- Experiment configuration -------------------------------------------------
# When True, the majority class is downsampled before splitting.
equal_no_samples_in_each_class = False
# NOTE(review): absolute, machine-specific path; adjust for your environment.
datapath = '/home/tracek/Data/gender/gender_warbler.csv'
# Shared random seed for row sampling and the train/val/test splits.
seed = 42

# Booster parameters for the native `xg.train` API.
# 'n_estimators' was removed: it belongs to the scikit-learn wrapper
# (XGBClassifier) and is not used by `xg.train`, where the number of trees is
# set exclusively through the `num_boost_round` argument.
params = {'max_depth': 13,
          'objective': 'binary:logistic',
          'eval_metric': ['auc', 'error'],
          'nthread': 15}

# Fraction of all rows held out as the test set (0 disables the test split).
test_fraction = 0.1
# Fraction of the rows remaining after the test split used for validation.
val_fraction = 0.2

In [3]:
# Read the feature table and discard two columns that are not used as features:
# 'centroid' (corresponds to meanfreq) and 'filename'.
data = pd.read_csv(datapath).drop(columns=['centroid', 'filename'])

# Per-class row counts; going by the variable names, label 0 is presumably
# "male" and label 1 "female" -- TODO confirm against the dataset description.
male_df_len = int((data['label'] == 0).sum())
female_df_len = int((data['label'] == 1).sum())

In [4]:
# Optionally balance the two classes by randomly downsampling the larger one.
# The class counts are recomputed here from the current `data` (rather than
# relying on counts cached by another cell), so re-running this cell on an
# already balanced frame is a no-op, and the cell works regardless of which
# label happens to be the majority.
if equal_no_samples_in_each_class:
    class_counts = data['label'].value_counts()
    majority_label = class_counts.idxmax()
    # Exact number of surplus rows in the majority class.
    n_excess = int(class_counts.max() - class_counts.min())
    if n_excess > 0:
        excess_index = (data[data['label'] == majority_label]
                        .sample(n=n_excess, random_state=seed)
                        .index)
        data = data.drop(excess_index)

In [5]:
# Split the target off the feature table: keep the 'label' column as `y` and
# remove it from `data` in place, so `data` holds only feature columns afterwards.
# NOTE: this mutates `data`, so the cell cannot be re-run without reloading.
y = data['label']
del data['label']

In [6]:
# Stage 1 (optional): hold out a stratified test set and wrap it in a DMatrix.
# When test_fraction is 0 the full dataset goes on to stage 2 and no
# X_test / y_test / dtest are created.
if test_fraction > 0.0:
    X_pool, X_test, y_pool, y_test = train_test_split(
        data, y,
        test_size=test_fraction,
        random_state=seed,
        stratify=y,
    )
    dtest = xg.DMatrix(X_test, label=y_test)
else:
    X_pool, y_pool = data, y

# Stage 2: stratified train/validation split of whatever remains.
# val_fraction is relative to the remaining pool, not to the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_pool, y_pool,
    test_size=val_fraction,
    random_state=seed,
    stratify=y_pool,
)

In [7]:
# Wrap the training and validation splits in XGBoost DMatrix containers.
dtrain, dval = (
    xg.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

# Both sets are evaluated after every boosting round; the names given here
# ('eval', 'train') are the prefixes shown in the per-round log.
evallist = [(dval, 'eval'), (dtrain, 'train')]

# Train for a fixed 30 boosting rounds (no early stopping is configured).
model = xg.train(params, dtrain, num_boost_round=30, evals=evallist)

[0]	eval-error:0.046532	eval-auc:0.969521	train-error:0.022135	train-auc:0.982331
[1]	eval-error:0.044402	eval-auc:0.9768	train-error:0.017425	train-auc:0.989938
[2]	eval-error:0.041539	eval-auc:0.979751	train-error:0.014562	train-auc:0.992642
[3]	eval-error:0.040607	eval-auc:0.980881	train-error:0.012499	train-auc:0.994321
[4]	eval-error:0.039609	eval-auc:0.981809	train-error:0.010635	train-auc:0.995483
[5]	eval-error:0.038743	eval-auc:0.982882	train-error:0.00932	train-auc:0.995953
[6]	eval-error:0.037878	eval-auc:0.983355	train-error:0.008222	train-auc:0.996779
[7]	eval-error:0.037478	eval-auc:0.983466	train-error:0.007223	train-auc:0.997676
[8]	eval-error:0.036813	eval-auc:0.98494	train-error:0.006158	train-auc:0.998514
[9]	eval-error:0.037279	eval-auc:0.985321	train-error:0.005242	train-auc:0.998947
[10]	eval-error:0.037279	eval-auc:0.98551	train-error:0.00451	train-auc:0.999145
[11]	eval-error:0.03668	eval-auc:0.985785	train-error:0.004144	train-auc:0.999377
[12]	eval-error:0.036

In [8]:
# Score the held-out test set, then turn the scores into hard 0/1 labels:
# anything strictly above the threshold is assigned class 1.
decision_threshold = 0.5
y_pred_prob = model.predict(dtest)
y_pred = np.greater(y_pred_prob, decision_threshold).astype(int)

In [9]:
# Test-set evaluation: a per-class precision/recall/F1 report and the
# overall accuracy, both computed from the thresholded predictions.
r = metrics.classification_report(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)

In [12]:
# Show the classification report built from the test-set predictions.
# print() is used so the multi-line report text is rendered line by line.
print(r)

             precision    recall  f1-score   support

          0       0.98      0.98      0.98     15765
          1       0.91      0.89      0.90      3013

avg / total       0.97      0.97      0.97     18778

