In [1]:
import time 
import numpy as np
import tensorflow as tf
import pandas as pd


In [7]:
categorical_columns = ['workclass','education',
                      'marital.status','occupation',
                      'relationship','race','gender',
                      'native-country']

# Columns of input csv

columns = ['age','workclass','fnlwgt','education',
          'educational-num','marital-status','occupation',
          'relationship','race','gender','capital-gain','capital-loss',
          'hours-per-week','native-country','income']

feature_column = ['age','workclass','fnlwgt','education',
          'educational-num','marital-status','occupation',
          'relationship','race','gender','capital-gain','capital-loss',
          'hours-per-week','native-country']

In [8]:
df = pd.read_csv('adult.csv')
df.head()
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [9]:
from sklearn.model_selection import train_test_split
batch_size = 40
num_epochs = 1
shuffle = True
print(df['income'].shape)
y= df['income'].apply(lambda x: ">50K" in x).astype(int) #binary response variable

del df['fnlwgt']
del df['income']
X=df
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

(48842,)


In [10]:
train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(x=X_train,
                                                              y=y_train,
                                                              batch_size=batch_size,
                                                              num_epochs=num_epochs,
                                                              shuffle=shuffle)

test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(x=X_test,
                                                              y=y_test,
                                                              batch_size=batch_size,
                                                              num_epochs=num_epochs,
                                                              shuffle=shuffle)

In [11]:
def generate_input_fn(filename,num_epochs=None,shuffle=True,batch_size=batch_size):
    df = pd.read_csv(filename)
    labels = df['income'].apply(lambda x: ">50K" in x).astype(int)
    del df['fnlwgt']
    del df['income']
    type(df['age'].iloc[3])
    
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=df,
    y=labels,
    batch_size=batch_size,
    num_epochs=num_epochs,
    shuffle=shuffle)

In [63]:
#Creating the feature columns

In [12]:
gender = tf.feature_column.categorical_column_with_vocabulary_list(key="gender",
                                                               vocabulary_list=['female','male'])
race = tf.feature_column.categorical_column_with_vocabulary_list(key="race",
                                                                vocabulary_list=['Amer-Indian-Eskimo',
                                                                                'Asian-Pac-Islander',
                                                                                'Black','Other','White'])

education = tf.feature_column.categorical_column_with_hash_bucket('education',hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket('marital-status',hash_bucket_size=100)
relationship = tf.feature_column.categorical_column_with_hash_bucket('relationship',hash_bucket_size=100)
work_class = tf.feature_column.categorical_column_with_hash_bucket('workclass',hash_bucket_size=100)
occupation = tf.feature_column.categorical_column_with_hash_bucket('occupation',hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket('native-country',hash_bucket_size=1000)

In [13]:
age = tf.feature_column.numeric_column('age')
education_num = tf.feature_column.numeric_column('educational-num')
capital_gain = tf.feature_column.numeric_column('capital-gain')
capital_loss = tf.feature_column.numeric_column('capital-loss')
hours_per_week = tf.feature_column.numeric_column('hours-per-week')

In [66]:
# The Wide columns are the sparse , categorical columns that we specified as well our hashed bicket feature

In [14]:
wide_columns = [gender,race,native_country,education,occupation,work_class,marital_status,relationship]

In [16]:
deep_columns = [
    #Multi -hot indicator column for column with fewer possibilities 
    tf.feature_column.indicator_column(work_class),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(gender),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(race),
    # Embeding for categories with more possibilites,
    # should have atleast (possibilty)*0.25 dimension
    tf.feature_column.embedding_column(education,dimension=8),
    tf.feature_column.embedding_column(native_country,dimension=8),
    tf.feature_column.embedding_column(occupation,dimension=8),
    #numerical column
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
    




]

### Wide : Linear CLassifier
### Deep : Deep Neural Classifier
### Wide and Deep : Combined Hybrid


In [18]:
def create_model_dir(model_type):
    return "model/model_" + model_type + "_" + str(int(time.time()))

# if new model is false pass in the desired model_dir

def get_model(model_type,wide_columns = None, deep_columns = None, new_model = False , model_dir=None):
    if new_model or model_dir is None:
        model_dir = create_model_dir(model_type) # Comment this line to traine existing model
    print('Model Directory {}'.format(model_dir))
    
    m= None
    #Linear Classifier
    if model_type == 'WIDE':
        m=tf.estimator.LinearClassifier(
        model_dir=model_dir,
        feature_columns = wide_columns)
    
    if model_type == 'DEEP':
        m=tf.estimator.DNNClassifier(
        model_dir=model_dir,
        feature_columns=deep_columns,
        hidden_units=[100,50])
    
    #Combine
    
    if model_type=="WIDE_AND_DEEP":
        m = tf.estimator.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns = wide_columns,
        dnn_feature_columns = deep_columns,
        dnn_hidden_units = [100,70,50,25])
    print("Estimator Generated")
    return m,model_dir

In [33]:
MODEL_TYPE = 'WIDE'
model_dir = create_model_dir(MODEL_TYPE)
m,model_dir = get_model(MODEL_TYPE,wide_columns,deep_columns,model_dir)

Model Directory model/model_WIDE_1595182148
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'model/model_WIDE_1595182148', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Estimator Generated


In [34]:
m.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into model/model_WIDE_1595182148/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 189.839
INFO:tensorflow:loss = 0.3775067, step = 100 (0.528 sec)
INFO:tensorflow:global_step/sec: 330.704
INFO:tensorflow:loss = 0.45498672, step = 200 (0.302 sec)
INFO:tensorflow:global_step/sec: 318.537
INFO:tensorflow:loss = 0.44410878, step = 300 (0.314 sec)
INFO:tensorflow:global_step/sec: 315.76
INFO:tensorflow:loss = 0.3379423, step = 400 (0.317 sec)
INFO:tensorflow:global_step/sec: 348.237
INFO:tensorflow:loss = 0.31566912, step = 500 (0.28

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f96a2e1ab20>

In [35]:
result = m.evaluate(input_fn=test_input_fn)
print("Accuracy = {}".format(result['accuracy']))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-19T23:09:15Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model/model_WIDE_1595182148/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.54482s
INFO:tensorflow:Finished evaluation at 2020-07-19-23:09:17
INFO:tensorflow:Saving dict for global step 977: accuracy = 0.82731086, accuracy_baseline = 0.75882894, auc = 0.8666224, auc_precision_recall = 0.6771954, average_loss = 0.37474066, global_step = 977, label/mean = 0.24117105, loss = 0.37516317, precision = 0.68511343, prediction/mean = 0.24452089, recall = 0.5254669
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 977: model/model_WIDE_1595182148/model.ckpt-977
Accuracy = 0.8273108601570129
