In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import pickle

In [2]:
# The CSV is read as header-less: passing `names=` explicitly supplies the
# column labels instead of taking them from the first row of the file.
column_names = [
    'age', 'workclass', 'education-num', 'occupation',
    'capital-gain', 'capital-loss', 'hours-per-week', 'income',
]
data = pd.read_csv("data/adult-data.csv", names=column_names)

In [3]:
# Preview the first five rows to check the values line up with the column names.
data.head(n=5)

Unnamed: 0,age,workclass,education-num,occupation,capital-gain,capital-loss,hours-per-week,income
0,39,State-gov,13,Adm-clerical,2174,0,40,<=50K
1,50,Self-emp-not-inc,13,Exec-managerial,0,0,13,<=50K
2,38,Private,9,Handlers-cleaners,0,0,40,<=50K
3,53,Private,7,Handlers-cleaners,0,0,40,<=50K
4,28,Private,13,Prof-specialty,0,0,40,<=50K


In [4]:
# Remove leading/trailing whitespace from the string-valued columns so that
# otherwise identical categories are not treated as distinct values.
text_columns = ['workclass', 'occupation', 'income']
data[text_columns] = data[text_columns].apply(lambda column: column.str.strip())

In [5]:
# One LabelEncoder per key, created lazily the first time a key is looked up.
# The default factory is the LabelEncoder class itself (not a lambda).
label_dict = defaultdict(LabelEncoder)

In [6]:
def _encode_column(column):
    """Fit the encoder stored under this column's name and return the integer codes."""
    return label_dict[column.name].fit_transform(column)


# Replace each categorical column with its label-encoded values; the fitted
# encoders stay in `label_dict`, keyed by column name.
categorical_columns = ['workclass', 'occupation', 'income']
data[categorical_columns] = data[categorical_columns].apply(_encode_column)

In [7]:
def save_obj(var, file):
    """Pickle ``var`` to ``<file>.pkl`` using the highest available protocol.

    Args:
        var: Any picklable object.
        file: Destination path *without* the ``.pkl`` suffix; the suffix is
            appended here.
    """
    target_path = file + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(var, sink, protocol=pickle.HIGHEST_PROTOCOL)
def load_obj(file):
    """Load and return the object pickled in ``<file>.pkl``.

    Args:
        file: Source path *without* the ``.pkl`` suffix; the suffix is
            appended here.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code, so only load files from a
        trusted source.
    """
    source_path = file + '.pkl'
    with open(source_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [8]:
# Persist the fitted encoders (written to income_labels.pkl) so the same
# category-to-integer mapping can be reloaded later.
save_obj(var=label_dict, file='income_labels')

In [9]:
# Features: every column except the target. drop() returns a new frame, so
# `data` itself is left unchanged.
X = data.drop(columns="income")
# Target: the `income` column.
Y = data["income"]

In [10]:
# Positional 80/20 split in file order (no shuffling): the first 80% of rows
# are used for training and the remainder is held out for evaluation.
split_at = int(X.shape[0] * 0.8)
X_train, X_test = X.iloc[:split_at].values, X.iloc[split_at:].values
Y_train, Y_test = Y.iloc[:split_at].values, Y.iloc[split_at:].values

In [11]:
# Wrap the feature arrays and their labels in XGBoost's DMatrix containers.
train = xgb.DMatrix(data=X_train, label=Y_train)  # used to fit the booster
test = xgb.DMatrix(data=X_test, label=Y_test)     # held-out evaluation set

In [12]:
# Booster hyper-parameters:
#   max_depth - maximum depth of each tree
#   eta       - learning rate (step-size shrinkage applied each boosting round)
#   verbosity - 0 silences XGBoost's log output; this replaces the deprecated
#               `silent` flag, which current releases report as an unused
#               parameter
#   objective - binary:hinge trains with hinge loss, so predict() returns hard
#               0/1 labels rather than probabilities
param = {'max_depth': 7, 'eta': 0.1, 'verbosity': 0, 'objective': 'binary:hinge'}
num_round = 50  # number of boosting rounds
model = xgb.train(param, train, num_boost_round=num_round)

In [13]:
# With the binary:hinge objective the booster outputs hard 0/1 labels, so the
# predictions can be compared to the true labels without thresholding.
preds = model.predict(data=test)

In [14]:
# Score against Y_test, the labels of the same held-out rows that `test` was
# built from. Re-deriving the slice here would duplicate the split ratio and
# could silently drift out of sync with the rows `preds` was computed on.
accuracy = accuracy_score(Y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 83.66%


In [15]:
# Write the trained booster to disk so it can be reloaded without retraining.
model.save_model(fname='income-model.model')