In [8]:
import tempfile
from urllib import request
train_file = tempfile.NamedTemporaryFile()
test_file = tempfile.NamedTemporaryFile()
request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)

('/tmp/tmpp_0b5bvd', <http.client.HTTPMessage at 0x7f8db8095b00>)

In [9]:
import pandas as pd
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"]
df_train = pd.read_csv(train_file.name, names=CSV_COLUMNS, skipinitialspace=True)
df_test = pd.read_csv(test_file.name, names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)

In [69]:
df_test["income_bracket"].value_counts()

<=50K.    12435
>50K.      3846
Name: income_bracket, dtype: int64

In [57]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
gender            32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
income_bracket    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [75]:
def input_fn(data_file, num_epochs, shuffle):
  """Input builder function."""
  df_data = pd.read_csv(
      tf.gfile.Open(data_file),
      names=CSV_COLUMNS,
      skipinitialspace=True,
      engine="python",
      skiprows=1)
  # remove NaN elements
  df_data = df_data.dropna(how="any", axis=0)
  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
  return tf.estimator.inputs.pandas_input_fn(
      x=df_data,
      y=labels,
      batch_size=50,
      num_epochs=num_epochs,
      shuffle=shuffle,
      num_threads=5)

In [60]:
import tensorflow as tf
gender = tf.feature_column.categorical_column_with_vocabulary_list(
    "gender", ["Female", "Male"])

occupation = tf.feature_column.categorical_column_with_hash_bucket(
    "occupation", hash_bucket_size=1000)

education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    "marital_status", [
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
    ])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    "relationship", [
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    "workclass", [
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])

native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)

age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

base_columns = [
    gender, native_country, education, occupation, workclass, relationship,
    age_buckets,
]

crossed_columns = [
    tf.feature_column.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000)
]

model_dir = tempfile.mkdtemp()
m = tf.estimator.LinearClassifier(
    model_dir=model_dir, feature_columns=base_columns + crossed_columns,
    optimizer=tf.train.AdamOptimizer())

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmprfx80h1s', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [None]:
# set num_epochs to None to get infinite stream of data.
m.train(
    input_fn=input_fn(train_file.name, num_epochs=5, shuffle=True))

In [74]:
results = m.evaluate(
    input_fn=input_fn(test_file.name, num_epochs=1, shuffle=False),
    steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
  print("%s: %s" % (key, results[key]))

INFO:tensorflow:Starting evaluation at 2017-09-15-09:10:04
INFO:tensorflow:Restoring parameters from /tmp/tmprfx80h1s/model.ckpt-8141
INFO:tensorflow:Finished evaluation at 2017-09-15-09:10:07
INFO:tensorflow:Saving dict for global step 8141: accuracy = 0.836865, accuracy_baseline = 0.763774, auc = 0.884242, auc_precision_recall = 0.698291, average_loss = 0.350483, global_step = 8141, label/mean = 0.236226, loss = 69.9291, prediction/mean = 0.235837
model directory = /tmp/tmprfx80h1s
accuracy: 0.836865
accuracy_baseline: 0.763774
auc: 0.884242
auc_precision_recall: 0.698291
average_loss: 0.350483
global_step: 8141
label/mean: 0.236226
loss: 69.9291
prediction/mean: 0.235837


In [76]:
results = m.evaluate(
    input_fn=input_fn(test_file.name, num_epochs=1, shuffle=False),
    steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
  print("%s: %s" % (key, results[key]))

INFO:tensorflow:Starting evaluation at 2017-09-15-09:10:35
INFO:tensorflow:Restoring parameters from /tmp/tmprfx80h1s/model.ckpt-8141
INFO:tensorflow:Finished evaluation at 2017-09-15-09:10:41
INFO:tensorflow:Saving dict for global step 8141: accuracy = 0.836865, accuracy_baseline = 0.763774, auc = 0.884242, auc_precision_recall = 0.698291, average_loss = 0.350482, global_step = 8141, label/mean = 0.236226, loss = 17.5144, prediction/mean = 0.235836
model directory = /tmp/tmprfx80h1s
accuracy: 0.836865
accuracy_baseline: 0.763774
auc: 0.884242
auc_precision_recall: 0.698291
average_loss: 0.350482
global_step: 8141
label/mean: 0.236226
loss: 17.5144
prediction/mean: 0.235836


In [71]:
results

{'accuracy': 0.83686507,
 'accuracy_baseline': 0.76377374,
 'auc': 0.884242,
 'auc_precision_recall': 0.69829094,
 'average_loss': 0.35048237,
 'global_step': 8141,
 'label/mean': 0.23622628,
 'loss': 35.007385,
 'prediction/mean': 0.23583724}