In [6]:
from __future__ import absolute_import, division, print_function
import numpy as np
import tensorflow as tf
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
import pandas as pd
from six.moves import urllib
import shutil

In [2]:
print(tf.__version__)
print(np.__version__)
print(matplotlib.__version__)

1.14.0
1.16.4
3.1.1


In [8]:
# data source https://archive.ics.uci.edu/ml/index.php
TRAIN_FILE_NAME = "cencus/adult.data"
TEST_FILE_NAME = "cencus/adult.test"
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                          TRAIN_FILE_NAME)
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                          TEST_FILE_NAME)

('cencus/adult.test', <http.client.HTTPMessage at 0x7f31db14e910>)

In [9]:
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender", 
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"
]

df = pd.read_csv(TRAIN_FILE_NAME, names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [10]:
TRIMMED_REQUIRED_COLUMNS = [
    "age", "workclass", "education_num", "marital_status",
    "relationship", "race", "gender", "occupation",
    "hours_per_week", "native_country", "income_bracket"
]

df = df[TRIMMED_REQUIRED_COLUMNS]

In [12]:
# Estimators
# get the unique features
gender = tf.feature_column.categorical_column_with_vocabulary_list("gender", df['gender'].unique())
race = tf.feature_column.categorical_column_with_vocabulary_list("race", df['race'].unique())
education = tf.feature_column.categorical_column_with_vocabulary_list("education", df['education_num'].unique())
marital_status = tf.feature_column.categorical_column_with_vocabulary_list("marital_status", df['marital_status'].unique())
relationship = tf.feature_column.categorical_column_with_vocabulary_list("relationship", df['relationship'].unique())
workclass = tf.feature_column.categorical_column_with_vocabulary_list("workclass", df['workclass'].unique())

age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

age_buckets = tf.feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

occupation = tf.feature_column.categorical_column_with_hash_bucket("occupation", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket("native_country", hash_bucket_size=1000)

base_columns = [gender, race, marital_status, workclass, occupation, native_country, age_buckets, education]

crossed_columns = [
    tf.feature_column.crossed_column(
            ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
            [age_buckets, "occupation", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
            ["native_country", "occupation"], hash_bucket_size=1000),
]

# Columns with continous numeric values
deep_columns = [education_num, hours_per_week]

In [25]:
def input_fn(file_name, num_epochs, shuffle):
    df = pd.read_csv(file_name, names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)
    df = df[TRIMMED_REQUIRED_COLUMNS]
    df = df.dropna(how="any", axis=0)
    labels = df["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
    
    return tf.estimator.inputs.pandas_input_fn(x=df, y=labels, 
                                               batch_size=100, num_epochs=num_epochs,
                                              shuffle=shuffle,
                                              num_threads=5)

MODEL_DIR = "./linear_classifier"
shutil.rmtree(MODEL_DIR)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=MODEL_DIR, feature_columns=base_columns + crossed_columns + deep_columns)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './linear_classifier', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f31dacbe6d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [26]:
linear_estimator.train(input_fn=input_fn(TRAIN_FILE_NAME, num_epochs=None, shuffle=True), steps=1000)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


ValueError: in converted code:
    relative to /home/jp/duke/advancedpython/advpy/lib/python3.7/site-packages/tensorflow/python/feature_column:

    feature_column_v2.py:704 call
        return self.layer(features)
    feature_column_v2.py:565 call
        weight_var=weight_var)
    feature_column_v2.py:2359 _create_weighted_sum
        weight_var=weight_var)
    feature_column_v2.py:2446 _create_categorical_column_weighted_sum
        state_manager)
    feature_column_v2.py:3710 get_sparse_tensors
        transformation_cache.get(self, state_manager), None)
    feature_column_v2.py:2562 get
        transformed = column.transform_feature(self, state_manager)
    feature_column_v2.py:3687 transform_feature
        transformation_cache.get(self.key, state_manager))
    feature_column_v2.py:2554 get
        raise ValueError('Feature {} is not in features dictionary.'.format(key))

    ValueError: Feature education is not in features dictionary.
