### Basic implementation of a TensorFlow Estimator

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tempfile
import tensorflow as tf

In [0]:
# Column names, in file order, for the UCI "adult" CSV files (the files are
# read without a header row, so the names are supplied here).
COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

# Parsing options shared by the train and test files:
#   * fields are separated by a comma with optional surrounding whitespace
#     (a regex separator, hence the python engine);
#   * a bare "?" is read as a missing value.
_ADULT_CSV_OPTIONS = dict(
    names=COLUMNS,
    sep=r'\s*,\s*',
    engine="python",
    na_values="?",
)

train_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    **_ADULT_CSV_OPTIONS)

# The first line of adult.test is skipped -- presumably a non-data header
# line; verify against the file.
test_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    skiprows=[0],
    **_ADULT_CSV_OPTIONS)

In [0]:
# Discard every row that has at least one missing value, in both splits.
train_df, test_df = (
    frame.dropna(axis=0, how="any") for frame in (train_df, test_df)
)

define input function

In [0]:
def input_fn(data, batch_size=100, num_epochs=1, shuffle=False):
  """Build an Estimator input function from a census DataFrame.

  Every column except 'income_bracket' is passed through as a feature. The
  label is 1 when the 'income_bracket' string contains ">50K" and 0 otherwise.

  Args:
    data: pandas DataFrame that has an 'income_bracket' column of strings.
    batch_size: number of rows handed to the model in each batch.
    num_epochs: number of times the whole dataset is processed; with 3 the
      dataset is processed 3 times.
    shuffle: forwarded unchanged to pandas_input_fn.

  Returns:
    Whatever tf.estimator.inputs.pandas_input_fn returns for these settings.

  Example:
    With 1000 rows, batch_size=100 and num_epochs=5, the model receives
    (1000 / 100) * 5 = 50 batches and updates its parameters once per batch:

      number of batches = (dataset size / batch size) * (number of epochs)
  """
  features = data.drop(columns='income_bracket')
  # Substring match (not a regex) so ">50K" and ">50K." both count as positive.
  labels = data['income_bracket'].str.contains(">50K", regex=False).astype(int)
  return tf.estimator.inputs.pandas_input_fn(
      x=features,
      y=labels,
      batch_size=batch_size,
      num_epochs=num_epochs,
      shuffle=shuffle,
      # Single reader thread; per the pandas_input_fn documentation this keeps
      # the reading order repeatable.
      num_threads=1)

define feature columns

numerical columns are - "capital_gain", "capital_loss", "hours_per_week",  "age", "fnlwgt", "education_num"

In [0]:
# Continuous features, exposed to the model as plain numeric columns.
# One column per numerical field in the dataset; the original cell defined
# `age` twice and never defined `education_num`.
age = tf.feature_column.numeric_column("age")
fnlwgt = tf.feature_column.numeric_column("fnlwgt")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

categorical columns - "workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country"

In [0]:
# Features with a fixed, explicitly listed vocabulary. The position of a value
# in its list is the id it is mapped to, so the order of each list matters.
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    key="workclass",
    vocabulary_list=[
        'Federal-gov', 'Local-gov', 'Private', 'Self-emp-inc',
        'Self-emp-not-inc', 'State-gov', 'Without-pay',
    ])

education = tf.feature_column.categorical_column_with_vocabulary_list(
    key="education",
    vocabulary_list=[
        '10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
        'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
        'Masters', 'Preschool', 'Prof-school', 'Some-college',
    ])

marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    key="marital_status",
    vocabulary_list=[
        'Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
        'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed',
    ])

relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    key="relationship",
    vocabulary_list=[
        'Husband', 'Not-in-family', 'Other-relative', 'Own-child',
        'Unmarried', 'Wife',
    ])

race = tf.feature_column.categorical_column_with_vocabulary_list(
    key="race",
    vocabulary_list=[
        'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
        'White',
    ])

gender = tf.feature_column.categorical_column_with_vocabulary_list(
    key="gender",
    vocabulary_list=['Female', 'Male'])

# Features whose values are not enumerated here: each string is hashed into
# one of 1000 buckets instead of being looked up in a vocabulary.
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    key="occupation",
    hash_bucket_size=1000)

native_country = tf.feature_column.categorical_column_with_hash_bucket(
    key="native_country",
    hash_bucket_size=1000)


transforming age from numerical to categorical using buckets

In [0]:
# Turn the numeric `age` column into a categorical one by splitting it at
# these ages.
age_buckets = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

Define which columns the model uses, and at the same time convert the categorical variables to numeric inputs: indicator (one-hot) encoding for the small vocabularies and learned embeddings for the hashed columns.

In [0]:
# Inputs to the DNN, in a fixed order: indicator-encoded columns first,
# then the two hashed columns as 8-dimensional embeddings.
deep_columns = [
    *(tf.feature_column.indicator_column(categorical)
      for categorical in (workclass, education, age_buckets, gender, relationship)),
    *(tf.feature_column.embedding_column(hashed, dimension=8)
      for hashed in (native_country, occupation)),
]

In [10]:
# Print the assembled feature columns to check how each one is configured.
print(deep_columns)

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='workclass', vocabulary_list=('Federal-gov', 'Local-gov', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='education', vocabulary_list=('10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65))), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), Indicat

define neural network model

In [13]:
# Tunable hyper-parameters.
HIDDEN_UNITS = [1024, 512] #@param
LEARNING_RATE = 0.1 #@param
L1_REGULARIZATION_STRENGTH = 0.0001 #@param
L2_REGULARIZATION_STRENGTH = 0.0001 #@param

# A fresh temporary directory is created on every run and handed to the
# estimator as its model directory.
model_dir = tempfile.mkdtemp()

# Proximal Adagrad optimizer with both L1 and L2 regularization.
proximal_adagrad = tf.train.ProximalAdagradOptimizer(
    learning_rate=LEARNING_RATE,
    l1_regularization_strength=L1_REGULARIZATION_STRENGTH,
    l2_regularization_strength=L2_REGULARIZATION_STRENGTH)

model = tf.estimator.DNNClassifier(
    feature_columns=deep_columns,
    hidden_units=HIDDEN_UNITS,
    optimizer=proximal_adagrad,
    model_dir=model_dir)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpvi02mbmj', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc59182e4a8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


train the model

In [15]:
STEPS = 1000 #@param

# Shuffled training input with num_epochs=None, so the epoch count is not
# what bounds training here; the `steps` argument is.
train_input_fn = input_fn(train_df, shuffle=True, num_epochs=None)

# Trailing semicolon suppresses the cell's return-value display.
model.train(input_fn=train_input_fn, steps=STEPS);

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
The old _FeatureColumn APIs are being 

evaluate the results

In [17]:
# Evaluate on a single, unshuffled pass over the test split, with no
# explicit cap on the number of steps.
eval_input_fn = input_fn(test_df, num_epochs=1, shuffle=False)
results = model.evaluate(input_fn=eval_input_fn, steps=None)

print("model directory = {}".format(model_dir))
print("---- Results ----")
# Report every metric, ordered alphabetically by name.
for metric_name, metric_value in sorted(results.items()):
  print("{}: {}".format(metric_name, metric_value))

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-27T17:52:01Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /tmp/tmpvi02mbmj/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-09-27-17:52:02
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.8322709, accuracy_baseline = 0.7543161, auc = 0.88452756, auc_precision_recall = 0.70659107, average_loss = 0.35925642, global_step = 1000, label/mean = 0.24568394, loss = 35.830475, precision = 0.6706395, prediction/mean = 0.25969675, recall = 0.6235135
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /tmp/tmpvi02mbmj/model.ckpt-1000
model directory = /tmp/tmp