# Gradient-Boosted Trees with Fixed Hyperparameters (XGBoost)

In [1]:
import os
import itertools

import joblib

import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import metrics

import xgboost as xgb

import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

## Data Loading

In [2]:
# Location of the cleaned census CSV, relative to the notebook's working directory.
DATA_FILE = "cleaned-census-data.csv"
DATA_DIR = os.path.join(os.pardir, "data", "census")

In [3]:
# Read the comma-separated census file into a DataFrame.
csv_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(csv_path, sep=',')

# Last expression of the cell: display the first five rows.
df.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,>50K
0,39,2174,0,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,50,0,0,13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38,0,0,40,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,53,0,0,40,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,28,0,0,40,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Separate the predictors from the '>50K' target column.
# The leading 'Unnamed: 0' column shown by df.head() holds 0, 1, 2, ... and
# appears to be a row index that was written into the CSV, not a census
# attribute, so it is dropped as well instead of being fed to the model.
# errors='ignore' keeps this cell working when that index column is absent;
# a missing '>50K' column still fails loudly on the label lookup below.
features_df = df.drop(columns=['>50K', 'Unnamed: 0'], errors='ignore')
labels_df = df['>50K']

## Data Splitting

In [5]:
# Fixed seed so the train/validation/test partitions -- and therefore the
# metrics printed below and the saved model -- are the same on every run.
RANDOM_SEED = 42

# Plain NumPy arrays for the splitter and for XGBoost.
X = features_df.values
y = labels_df.values

# Hold out 20% of the rows for testing, then take 20% of the remainder for
# validation (64% train / 16% validation / 20% test overall).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X_train, y_train, test_size=0.2, random_state=RANDOM_SEED
)

# Wrap each partition, together with its labels, in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

## Training

In [6]:
# Tunable booster settings; printed on their own before the fixed settings are added.
hyperparams = dict(eta=0.5, max_depth=7)
print(hyperparams)

# Fixed settings: binary logistic objective, classification error as the evaluation metric.
hyperparams.update(objective="binary:logistic", eval_metric=['error'])

# Number of boosting rounds and the datasets evaluated after each round.
eval_list = [(dtrain, 'train'), (dval, 'val')]
num_rounds = 20

{'eta': 0.5, 'max_depth': 7}


In [7]:
# Fit the booster on the training matrix, reporting the metric on every set in eval_list each round.
bst = xgb.train(
    params=hyperparams,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    evals=eval_list,
)

[11:26:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=7
[0]	train-error:0.163442	val-error:0.171328
[11:26:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=7
[1]	train-error:0.161032	val-error:0.167391
[11:26:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=7
[2]	train-error:0.160964	val-error:0.167391
[11:26:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 130 extra nodes, 0 pruned nodes, max_depth=7
[3]	train-error:0.156042	val-error:0.162096
[11:26:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 128 extra nodes, 0 pruned nodes, max_depth=7
[4]	train-error:0.154786	val-error:0.16101
[11:26:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=7
[5]	train-error:0.154141	val-error:0.161689
[11:26:14] src/tree/updater_prune.cc:74: tree pruning end, 

## Testing

In [8]:
# Accuracy on the training and test matrices; the booster's predictions are
# rounded to 0/1 labels before being compared with the stored labels.
for split_name, dmat in (("Training", dtrain), ("Testing", dtest)):
    predicted = bst.predict(dmat).round()
    print(f"{split_name} accuracy: {metrics.accuracy_score(dmat.get_label(), predicted)}")

Training accuracy: 0.8629327902240326
Testing accuracy: 0.8535896600412729


In [9]:
# F1 score on the training and test matrices, using predictions rounded to 0/1 labels.
train_pred = bst.predict(dtrain).round()
test_pred = bst.predict(dtest).round()

train_f1 = metrics.f1_score(dtrain.get_label(), train_pred)
test_f1 = metrics.f1_score(dtest.get_label(), test_pred)

print(f"Training F-score: {train_f1}")
print(f"Testing F-score: {test_f1}")

Training F-score: 0.6786055396370583
Testing F-score: 0.6575203252032521


## Save Model

In [10]:
# Create the output directory if needed so the dump does not fail on a fresh checkout.
output_dir = os.path.join("..", "output")
os.makedirs(output_dir, exist_ok=True)

# Serialize the trained booster. Kept as the cell's final expression so the
# list of written file names is still displayed as the cell output.
joblib.dump(bst, os.path.join(output_dir, "xgboost.gz"))

['../output/xgboost.gz']