In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score

In [2]:
headers = ['age','workclass','fnlwgt','education','edu_num','marital_status',
           'occupation','relationship','race','sex','cap_gain','cap_loss',
           'work_hrs_weekly','country','income']
try:
    print('Getting Data...')
    df_train_raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',names=headers)
    df_test_raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',names=headers)
    print('Data Loaded Successfully!')
except Exception as e:
    print(str(e))

Getting Data...
Data Loaded Successfully!


**Checking for missing values**

In [3]:
df_train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                32561 non-null int64
workclass          32561 non-null object
fnlwgt             32561 non-null int64
education          32561 non-null object
edu_num            32561 non-null int64
marital_status     32561 non-null object
occupation         32561 non-null object
relationship       32561 non-null object
race               32561 non-null object
sex                32561 non-null object
cap_gain           32561 non-null int64
cap_loss           32561 non-null int64
work_hrs_weekly    32561 non-null int64
country            32561 non-null object
income             32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df_train_raw.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
edu_num            0
marital_status     0
occupation         0
relationship       0
race               0
sex                0
cap_gain           0
cap_loss           0
work_hrs_weekly    0
country            0
income             0
dtype: int64

**Preprocess Data**

In [6]:
def convert_to_dummies(X):
    
    #Initialize Empty DataFrame
    df_preprocess = pd.DataFrame(index=X.index)
    
    #Identify Categorical Variables
    cols_interest = [c for c in X.columns if 'income' not in c]
    
    #Iterate over Columns and Convert to Dummy Variables
    for c,r in X[cols_interest].iteritems():
        if r.dtype == object:
            r = pd.get_dummies(r,prefix=c)
        #join to one dataframe
        df_preprocess = df_preprocess.join(r)
    
    #Add Target Variable back to DataFrame
    df_out = pd.concat([X['income'],df_preprocess],axis=1)
    return(df_out)

In [7]:
def preprocess(df_train,df_test):
    
    #Missing values are denoted by '?' - replace with the missing data with NaNs
    def replace_nans(df):
        for c in df.columns:
            df[c] = df[c].replace(to_replace=' ?',value=np.nan)
        return(df)
            
    df_train = replace_nans(df_train).dropna().reset_index(drop=True)  
    df_test = replace_nans(df_test).dropna().reset_index(drop=True)
    
    #Remove ending '.' for test.income 
    try:
        df_test['income'] = df_test['income'].apply(lambda x: x.strip('.'))
    except:
        pass
    
    #Remove any whitespace
    cat_variables = [c for c,r in df_train.iteritems() if r.dtype == object]
    for col in cat_variables:
        df_train[col] = df_train[col].str.strip()
        df_test[col] = df_test[col].str.strip()
    
    #Standardize Numerical Values
    num_variables = [c for c,r in df_train.iteritems() if r.dtype == int and c not in ['income']]
    for c in num_variables:
        std = MinMaxScaler().fit(df_train[c].astype(np.float32).values.reshape(-1,1))
        df_train[c] = std.transform(df_train[c].astype(np.float32).values.reshape(-1,1))
        df_test[c] = std.transform(df_test[c].astype(np.float32).values.reshape(-1,1))
        
    #One-Hot Encoding for Categorical Variables
    df_train = convert_to_dummies(df_train)
    df_test = convert_to_dummies(df_test)
                       
    #Setup encoder for 'income' variable
    tmp = LabelEncoder()
    df_train['income'] = tmp.fit_transform(df_train['income'])
    df_test['income'] = tmp.transform(df_test['income'])
    
    return(df_train,df_test)
    

In [8]:
df_train,df_test = preprocess(df_train_raw,df_test_raw)

### Build Nerual Network using TensorFlow

In [9]:
import tensorflow as tf #entire api
from tensorflow.contrib import learn #get access to many wrappers
from tensorflow.contrib import layers
from tensorflow.contrib.learn import SKCompat #similar interfaces used in sklearn
tf.logging.set_verbosity(tf.logging.WARN) # control the verbosity of tensor flow
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib

In [30]:
def Nerual_Network(features, targets, mode):

    """ Build Deep Neural Net -- return predictions, loss, training_op
    
    Keyword arguments:
    
    dict_features: input features
    target: target features
    mode: 
    """
    
    # =====SETUP ARCHITECTURE=====
    #One hidden layer with RELU activation
    features = layers.relu(features, num_outputs=50) 
    #Fully Connected Layer with 1 output
    features = layers.fully_connected(features, num_outputs=100) 
    #Second Fully Connected Layer
    features = layers.fully_connected(features, num_outputs=1) 
    #Pass through a sigmoid activation
    output_layer = tf.sigmoid(features) 
    #Reshape the output to be one dimensional
    predictions = tf.reshape(output_layer, [-1])

    loss_mse = None
    train_op = None
    
    # Calculate Loss
    if mode != learn.ModeKeys.INFER:
        # =====LOSS=======
        #using MSE as loss function
        loss_mse = tf.losses.mean_squared_error(targets, predictions) 
    
    if mode == learn.ModeKeys.TRAIN:
        # =====OPTIMIZER PARAMS========
        train_op = layers.optimize_loss(
            loss=loss_mse, 
            global_step=tf.contrib.framework.get_global_step(),
            optimizer='Adagrad', 
            learning_rate=0.1)
    
    #Prediction Threshold
    predictions_out = predictions>0.5
    
    model = model_fn_lib.ModelFnOps(mode=mode, predictions={ 
        'incomes':predictions_out}, loss=loss_mse, train_op=train_op)
    
    return(model)

**Fit Model**

In [31]:
features = df_train.ix[:,1:].astype(np.float32).values
target = df_train.ix[:,0].astype(np.float32).values

In [32]:
%%time 
clf = learn.Estimator(model_fn=Nerual_Network)
clf.fit(features,target,steps=500)

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
CPU times: user 2min 36s, sys: 10.4 s, total: 2min 47s
Wall time: 1min


**Predictions**

In [28]:
l = df_test.astype(np.float32).values
y_test = df_test['income'].values

In [29]:
yhat = clf.predict(l)
yhat = [x['incomes'] for x in yhat]
print(confusion_matrix(y_test,yhat),accuracy_score(y_test,yhat))

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
[[10752   608]
 [ 3601    99]] 0.720517928287
