In [1]:
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Column names are supplied explicitly to read_csv rather than taken from the file.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
data = pd.read_csv("data/adult-data.csv", names=column_names)

In [3]:
# Preview the first rows to sanity-check that the columns were named as intended.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# String-valued columns (including the target) whose values get trimmed.
# NOTE(review): presumably the raw CSV pads fields with spaces — confirm against the file.
text_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]
data[text_columns] = data[text_columns].apply(lambda column: column.str.strip())

In [6]:
# Maps a column name to its own LabelEncoder; a fresh encoder is created the
# first time a given name is looked up.
label_dict = defaultdict(LabelEncoder)

In [7]:
# Columns (including the target) that are replaced by integer codes.
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]


def _encode_column(column):
    """Fit the encoder registered under this column's name and return its codes."""
    return label_dict[column.name].fit_transform(column)


# Each column gets its own fitted encoder, kept in label_dict for later reuse.
data[categorical_columns] = data[categorical_columns].apply(_encode_column)

In [8]:
def save_obj(var, file):
    """Pickle ``var`` to ``<file>.pkl`` using the highest available protocol.

    Args:
        var: Any picklable object.
        file: Destination path *without* the ``.pkl`` extension. May be a
            ``str`` or any ``os.PathLike`` (e.g. ``pathlib.Path``).

    Raises:
        TypeError: If ``file`` is not a text path (same as before for
            non-string input).
        OSError: If the destination cannot be opened for writing.
    """
    # os.fspath leaves str untouched and unwraps PathLike objects, so the
    # plain string concatenation below keeps working for both.
    target = os.fspath(file) + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(var, handle, pickle.HIGHEST_PROTOCOL)
def load_obj(file):
    """Load and return the object pickled at ``<file>.pkl``.

    Args:
        file: Source path *without* the ``.pkl`` extension. May be a ``str``
            or any ``os.PathLike`` (e.g. ``pathlib.Path``).

    Returns:
        The unpickled object.

    Raises:
        TypeError: If ``file`` is not a text path.
        OSError: If the file cannot be opened for reading.

    Warning:
        Unpickling can execute arbitrary code; only load files you created
        or otherwise trust.
    """
    # os.fspath leaves str untouched and unwraps PathLike objects.
    source = os.fspath(file) + '.pkl'
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [9]:
# Persist the fitted per-column encoders; save_obj appends '.pkl', so this
# writes 'income_labels.pkl'.
save_obj(label_dict, 'income_labels')

In [14]:
# Features: every column except the target. drop() returns a new frame, so
# `data` itself is left untouched.
X = data.drop(columns="income")
# Target: the 'income' column.
Y = data["income"]

In [15]:
# Sequential (unshuffled) split: the first 80% of rows train, the remainder test.
x_cut = int(X.shape[0] * 0.8)
y_cut = int(Y.shape[0] * 0.8)
X_train, X_test = X.iloc[:x_cut].values, X.iloc[x_cut:].values
Y_train, Y_test = Y.iloc[:y_cut].values, Y.iloc[y_cut:].values

In [17]:
# Wrap each split, together with its labels, in an XGBoost DMatrix.
train = xgb.DMatrix(data=X_train, label=Y_train)
test = xgb.DMatrix(data=X_test, label=Y_test)

In [18]:
# Booster settings: trees up to depth 7, learning rate 0.1, hinge-loss binary
# objective. 'verbosity': 0 replaces the deprecated 'silent': 1 flag, which
# current XGBoost releases no longer recognise (they warn it is unused).
param = {'max_depth': 7, 'eta': 0.1, 'verbosity': 0, 'objective': 'binary:hinge'}
num_round = 50  # number of boosting rounds
model = xgb.train(param, train, num_round)

In [19]:
# Predict on the held-out DMatrix.
# NOTE(review): with the 'binary:hinge' objective these are expected to be hard
# 0/1 labels rather than probabilities — confirm against the XGBoost docs.
preds = model.predict(test)

In [20]:
# Score against Y_test from the split cell rather than re-deriving the slice
# here, so the evaluation labels cannot drift from the split that produced the
# test DMatrix if the split ratio is ever changed.
accuracy = accuracy_score(Y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 86.32%


In [21]:
# Write the trained booster to disk next to the notebook.
# NOTE(review): newer XGBoost releases choose the serialization format from the
# file extension — confirm '.model' still produces the format consumers expect.
model.save_model('income-model.model')